[clang] 48fb8ee - [Clang][SME2] Add multi-vector add/sub builtins (#69725)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Nov 7 07:42:47 PST 2023
Author: Kerry McLaughlin
Date: 2023-11-07T15:42:43Z
New Revision: 48fb8ee081587b4c8e96473d6c7db6b4a83c8dc1
URL: https://github.com/llvm/llvm-project/commit/48fb8ee081587b4c8e96473d6c7db6b4a83c8dc1
DIFF: https://github.com/llvm/llvm-project/commit/48fb8ee081587b4c8e96473d6c7db6b4a83c8dc1.diff
LOG: [Clang][SME2] Add multi-vector add/sub builtins (#69725)
Adds the following SME2 builtins:
- sv(add|sub)
- sv(add|sub)_za32/za64,
- sv(add|sub)_write_za32/za64
Other changes in this patch:
- CGBuiltin.cpp: The GetAArch64SMEProcessedOperands function is created
to avoid duplicating existing code from EmitAArch64SVEBuiltinExpr.
- arm_sve.td: The add/sub SME2 builtins which do not operate on ZA have
been added to arm_sve.td, matching the corrosponding LLVM IR intrinsic
names which start with @llvm.aarch64.sve for this reason.
- SveEmitter.cpp: Adds the createCoreHeaderIntrinsics function to remove
duplicated code in createHeader & createSMEHeader. Uses a new enum
(ACLEKind) to choose either "__builtin_sme_" or "__builtin_sve_" when
emitting the intrinsics.
See https://github.com/ARM-software/acle/pull/217/files
Added:
clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
Modified:
clang/include/clang/Basic/arm_sme.td
clang/include/clang/Basic/arm_sve.td
clang/include/clang/Basic/arm_sve_sme_incl.td
clang/lib/CodeGen/CodeGenFunction.h
clang/utils/TableGen/SveEmitter.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 8d85327a86b1aaf..b5655afdf419ecf 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -263,3 +263,38 @@ multiclass ZAFPOuterProd<string n_suffix> {
defm SVMOPA : ZAFPOuterProd<"mopa">;
defm SVMOPS : ZAFPOuterProd<"mops">;
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - ADD, SUB
+
+multiclass ZAAddSub<string n_suffix> {
+ let TargetGuard = "sme2" in {
+ def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ let TargetGuard = "sme-i16i64" in {
+ def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+ }
+
+ let TargetGuard = "sme-f64f64" in {
+ def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+ }
+ }
+}
+
+defm SVADD : ZAAddSub<"add">;
+defm SVSUB : ZAAddSub<"sub">;
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b5baafedd139602..3d4c2129565903d 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1980,3 +1980,15 @@ def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sv
defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUl", "aarch64_sve_revd">;
}
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2
+
+// SME intrinsics which operate only on vectors and do not require ZA should be added here,
+// as they could possibly become SVE instructions in the future.
+
+let TargetGuard = "sme2" in {
+// == ADD (vectors) ==
+ def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
+ def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>;
+}
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 3a7a5b51b25801e..22a2a3c5434d657 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -257,7 +257,7 @@ class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
}
class Inst<string n, string p, string t, MergeType mt, string i,
- list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
+ list<FlagType> ft, list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
string Name = n;
string Prototype = p;
string Types = t;
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ee2c4b1e10afba5..42f94c9b540191e 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4297,6 +4297,7 @@ class CodeGenFunction : public CodeGenTypeCache {
/// the wider vector. This avoids the error when allocating space in llvm
/// for struct of scalable vectors if a function returns struct.
llvm::Value *FormSVEBuiltinResult(llvm::Value *Call);
+
llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags,
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
new file mode 100644
index 000000000000000..dd96dca70d63705
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
@@ -0,0 +1,649 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svadd_write_single2_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s32j11svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s64j11svint64x2_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u64j12svuint64x2_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svadd_write_single4_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s32j11svint32x4_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single4_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u32j12svuint32x4_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single4_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s64j11svint64x4_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single4_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u64j12svuint64x4_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi-Multi
+//
+
+// x2
+
+// CHECK-LABEL: @test_svadd_write_multi2_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s32j11svint32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi2_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u32j12svuint32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi2_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s64j11svint64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi2_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u64j12svuint64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svadd_write_multi4_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s32j11svint32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi4_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u32j12svuint32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi4_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s64j11svint64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_multi4_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u64j12svuint64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Accumulate to ZA
+//
+
+// x2
+
+// CHECK-LABEL: @test_svadd_za32_vg1x2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_f32j13svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x2_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_s32j11svint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base , zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x2_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_u32j12svuint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_f64j13svfloat64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x2_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_s64j11svint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x2_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_u64j12svuint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, zn);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svadd_za32_vg1x4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_f32j13svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x4_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_s32j11svint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za32_vg1x4_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_u32j12svuint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_f64j13svfloat64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x4_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_s64j11svint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svadd_za64_vg1x4_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_u64j12svuint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
new file mode 100644
index 000000000000000..9570deab0b39198
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
@@ -0,0 +1,649 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svsub_write_single2_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s32j11svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s64j11svint64x2_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svsub_write_single4_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s32j11svint32x4_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single4_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u32j12svuint32x4_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single4_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s64j11svint64x4_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single4_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u64j12svuint64x4_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi-Multi
+//
+
+// x2
+
+// CHECK-LABEL: @test_svsub_write_multi2_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s32j11svint32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_multi2_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u32j12svuint32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_multi2_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s64j11svint64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_multi2_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u64j12svuint64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svsub_write_multi4_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s32j11svint32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_multi4_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u32j12svuint32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_multi4_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s64j11svint64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_multi4_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u64j12svuint64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Accumulate to ZA
+//
+
+// x2
+
+// CHECK-LABEL: @test_svsub_za32_vg1x2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_f32j13svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za32_vg1x2_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_s32j11svint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x2)(slice_base , zn);
+}
+
+// CHECK-LABEL: @test_svsub_za32_vg1x2_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_u32j12svuint32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za64_vg1x2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_f64j13svfloat64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za64_vg1x2_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_s64j11svint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x2)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za64_vg1x2_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_u64j12svuint64x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, zn);
+}
+
+// x4
+
+// CHECK-LABEL: @test_svsub_za32_vg1x4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_f32j13svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za32_vg1x4_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_s32j11svint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za32_vg1x4_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_u32j12svuint32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za64_vg1x4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_f64j13svfloat64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za64_vg1x4_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_s64j11svint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x4)(slice_base, zn);
+}
+
+// CHECK-LABEL: @test_svsub_za64_vg1x4_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_u64j12svuint64x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
new file mode 100644
index 000000000000000..85c4b9b09546b38
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
@@ -0,0 +1,539 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Multi-Single Vector
+//
+
+// x2
+
+// CHECK-LABEL: @test_svadd_vector_single2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP6]]
+//
+svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s8_x2,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP6]]
+//
+svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u8_x2,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s16_x2,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP6]]
+//
+svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u16_x2,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single2_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s32_x2,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single2_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP6]]
+//
+svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u32_x2,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single2_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s64_x2,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single2_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP6]]
+//
+svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u64_x2,,,)(zn, zm);
+}
+
+
+// x4
+
+// CHECK-LABEL: @test_svadd_vector_single4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 16)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 32)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 32)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP12]]
+//
+svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s8_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 16)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 32)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 16)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 32)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP12]]
+//
+svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u8_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP12]]
+//
+svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s16_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP12]]
+//
+svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u16_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP12]]
+//
+svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s32_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP12]]
+//
+svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u32_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP12]]
+//
+svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_s64_x4,,,)(zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_vector_single4_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP12]]
+//
+svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) __arm_streaming {
+ return SVE_ACLE_FUNC(svadd,_single_u64_x4,,,)(zn, zm);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 6c82368c3c7229c..d00989ac0f3beb5 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -44,6 +44,8 @@ enum ClassKind {
ClassG, // Overloaded name without type suffix
};
+enum class ACLEKind { SVE, SME };
+
using TypeSpec = std::string;
namespace {
@@ -249,7 +251,7 @@ class Intrinsic {
}
/// Emits the intrinsic declaration to the ostream.
- void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const;
+ void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, ACLEKind Kind) const;
private:
std::string getMergeSuffix() const { return MergeSuffix; }
@@ -351,6 +353,10 @@ class SVEEmitter {
/// Emit arm_sve.h.
void createHeader(raw_ostream &o);
+ // Emits core intrinsics in both arm_sme.h and arm_sve.h
+ void createCoreHeaderIntrinsics(raw_ostream &o, SVEEmitter &Emitter,
+ ACLEKind Kind);
+
/// Emit all the __builtin prototypes and code needed by Sema.
void createBuiltins(raw_ostream &o);
@@ -1035,7 +1041,8 @@ std::string Intrinsic::mangleName(ClassKind LocalCK) const {
getMergeSuffix();
}
-void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const {
+void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter,
+ ACLEKind Kind) const {
bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1;
std::string FullName = mangleName(ClassS);
@@ -1052,9 +1059,17 @@ void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const {
SMEAttrs += ", arm_preserves_za";
OS << (IsOverloaded ? "__aio " : "__ai ")
- << "__attribute__((__clang_arm_builtin_alias("
- << (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_")
- << FullName << ")";
+ << "__attribute__((__clang_arm_builtin_alias(";
+
+ switch (Kind) {
+ case ACLEKind::SME:
+ OS << "__builtin_sme_" << FullName << ")";
+ break;
+ case ACLEKind::SVE:
+ OS << "__builtin_sve_" << FullName << ")";
+ break;
+ }
+
if (!SMEAttrs.empty())
OS << SMEAttrs;
OS << "))\n";
@@ -1192,6 +1207,34 @@ void SVEEmitter::createIntrinsic(
}
}
+void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS,
+ SVEEmitter &Emitter,
+ ACLEKind Kind) {
+ SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
+ std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+ for (auto *R : RV)
+ createIntrinsic(R, Defs);
+
+ // Sort intrinsics in header file by following order/priority:
+ // - Architectural guard (i.e. does it require SVE2 or SVE2_AES)
+ // - Class (is intrinsic overloaded or not)
+ // - Intrinsic name
+ std::stable_sort(Defs.begin(), Defs.end(),
+ [](const std::unique_ptr<Intrinsic> &A,
+ const std::unique_ptr<Intrinsic> &B) {
+ auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
+ return std::make_tuple(I->getGuard(),
+ (unsigned)I->getClassKind(),
+ I->getName());
+ };
+ return ToTuple(A) < ToTuple(B);
+ });
+
+ // Actually emit the intrinsic declarations.
+ for (auto &I : Defs)
+ I->emitIntrinsic(OS, Emitter, Kind);
+}
+
void SVEEmitter::createHeader(raw_ostream &OS) {
OS << "/*===---- arm_sve.h - ARM SVE intrinsics "
"-----------------------------------===\n"
@@ -1350,27 +1393,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
}
}
- SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
- std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
- for (auto *R : RV)
- createIntrinsic(R, Defs);
-
- // Sort intrinsics in header file by following order/priority:
- // - Architectural guard (i.e. does it require SVE2 or SVE2_AES)
- // - Class (is intrinsic overloaded or not)
- // - Intrinsic name
- std::stable_sort(
- Defs.begin(), Defs.end(), [](const std::unique_ptr<Intrinsic> &A,
- const std::unique_ptr<Intrinsic> &B) {
- auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
- return std::make_tuple(I->getGuard(), (unsigned)I->getClassKind(), I->getName());
- };
- return ToTuple(A) < ToTuple(B);
- });
-
- // Actually emit the intrinsic declarations.
- for (auto &I : Defs)
- I->emitIntrinsic(OS, *this);
+ createCoreHeaderIntrinsics(OS, *this, ACLEKind::SVE);
OS << "#define svcvtnt_bf16_x svcvtnt_bf16_m\n";
OS << "#define svcvtnt_bf16_f32_x svcvtnt_bf16_f32_m\n";
@@ -1560,30 +1583,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
OS << "extern \"C\" {\n";
OS << "#endif\n\n";
- SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
- std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
- for (auto *R : RV)
- createIntrinsic(R, Defs);
-
- // Sort intrinsics in header file by following order/priority similar to SVE:
- // - Architectural guard
- // - Class (is intrinsic overloaded or not)
- // - Intrinsic name
- std::stable_sort(Defs.begin(), Defs.end(),
- [](const std::unique_ptr<Intrinsic> &A,
- const std::unique_ptr<Intrinsic> &B) {
- auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
- return std::make_tuple(I->getGuard(),
- (unsigned)I->getClassKind(),
- I->getName());
- };
- return ToTuple(A) < ToTuple(B);
- });
-
- // Actually emit the intrinsic declaration.
- for (auto &I : Defs) {
- I->emitIntrinsic(OS, *this);
- }
+ createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME);
OS << "#ifdef __cplusplus\n";
OS << "} // extern \"C\"\n";
More information about the cfe-commits
mailing list