[llvm] 6b9a9cf - [msan][test] Add another target("aarch64.svcount") test case (#164343)

via llvm-commits llvm-commits@lists.llvm.org
Tue Oct 21 18:14:25 PDT 2025


Author: Thurston Dang
Date: 2025-10-21T18:14:21-07:00
New Revision: 6b9a9cf040d33ad7f9cd563a907b13e373313255

URL: https://github.com/llvm/llvm-project/commit/6b9a9cf040d33ad7f9cd563a907b13e373313255
DIFF: https://github.com/llvm/llvm-project/commit/6b9a9cf040d33ad7f9cd563a907b13e373313255.diff

LOG: [msan][test] Add another target("aarch64.svcount") test case (#164343)

This shows a crash that happens because MSan tries to check the shadow
of a target("aarch64.svcount")-sized argument.

This is a follow-up to
https://github.com/llvm/llvm-project/pull/164315. It also applies a
drive-by fix to those test cases, removing FileCheck from the RUN lines
(otherwise, even if opt passed, the test would still XFAIL because
FileCheck cannot find any CHECK: assertions).
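
For reference, the body of the new mini reproducer (copied from the
sme2-intrinsics-add-mini.ll test added below, with an explanatory
comment added) boils down to an SVE intrinsic call that receives a
target("aarch64.svcount") argument:

  define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory {
    ; %1 is a target("aarch64.svcount") value; per the description above, MSan
    ; crashes when it tries to check the shadow of this argument at the ld1 call.
    %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
    %2 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %1, ptr %ptr)
    ret void
  }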

Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll

Added: 
    llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
    llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll

Modified: 
    llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
    llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
index 1ddcd4b56688c..1c869bd41b931 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s
 
 ; XFAIL: *
 

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
index 9caa89de63748..00cf3204464d0 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s
+; RUN: opt -S -passes=msan -mattr=+sme -o - %s
 
 ; XFAIL: *
 

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
new file mode 100644
index 0000000000000..3f43efa233621
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s
+
+; XFAIL: *
+
+; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
+; Manually reduced to show MSan leads to a compiler crash
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory {
+  %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %2 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %1, ptr %ptr)
+  ret void
+}

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll
new file mode 100644
index 0000000000000..cd04373c11d20
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll
@@ -0,0 +1,340 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s
+
+; XFAIL: *
+
+; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,  <vscale x 4 x i32> %zm) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zm)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice.7,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zm)
+  ret void
+}
+
+define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,  <vscale x 2 x i64> %zm) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zm)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice.7,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zm)
+  ret void
+}
+
+
+define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
+  <vscale x 4 x i32> %zm) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
+  <vscale x 4 x i32> %zm)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice.7,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
+  <vscale x 4 x i32> %zm)
+  ret void
+}
+
+define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
+  <vscale x 2 x i64> %zm) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
+  <vscale x 2 x i64> %zm)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice.7,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
+  <vscale x 2 x i64> %zm)
+  ret void
+}
+
+
+define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice.7,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
+  ret void
+}
+
+
+define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice.7,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
+  ret void
+}
+
+
+
+define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
+  <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
+  <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
+  <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
+  <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice.7,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
+  <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
+  <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
+  ret void
+}
+
+define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
+  <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
+  <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) sanitize_memory {
+  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
+  <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
+  <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice.7,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
+  <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
+  <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice,<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice,
+  <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice.7,
+  <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice,
+  <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice.7,
+  <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_f64_tuple(i64 %stride, ptr %ptr) sanitize_memory {
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
+  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
+  %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %5)
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %6)
+  ret void
+}
+
+
+define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice.7,
+  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice.7,
+  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice,
+  <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+  <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice.7,
+  <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+  <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory {
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 0
+  %3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 1
+  %4 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 2
+  %5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 0
+  %8 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 1
+  %9 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 2
+  %10 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 0
+  %13 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 1
+  %14 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 2
+  %15 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 0
+  %18 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 1
+  %19 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 2
+  %20 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 3
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %2, <vscale x 4 x float> %7, <vscale x 4 x float> %12, <vscale x 4 x float> %17)
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %3, <vscale x 4 x float> %8, <vscale x 4 x float> %13, <vscale x 4 x float> %18)
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %4, <vscale x 4 x float> %9, <vscale x 4 x float> %14, <vscale x 4 x float> %19)
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %5, <vscale x 4 x float> %10, <vscale x 4 x float> %15, <vscale x 4 x float> %20)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) sanitize_memory {
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice,
+  <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+  <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice.7,
+  <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+  <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
+  ret void
+}
+
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) sanitize_memory {
+  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
+  @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
+  <vscale x 16 x i8> %zm)
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) sanitize_memory {
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> }
+  @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
+  <vscale x 8 x i16> %zm)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) sanitize_memory {
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+  @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
+  <vscale x 4 x i32> %zm)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) sanitize_memory {
+  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> }
+  @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
+  <vscale x 2 x i64> %zm)
+  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8>%zm) sanitize_memory {
+  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
+  @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
+  <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
+  <vscale x 16 x i8> %zm)
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_x4_single_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) sanitize_memory {
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
+  @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
+  <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
+  <vscale x 8 x i16> %zm)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_x4_single_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) sanitize_memory {
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
+  @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
+  <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
+  <vscale x 4 x i32> %zm)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_x4_single_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) sanitize_memory {
+  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
+  @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
+  <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
+  <vscale x 2 x i64> %zm)
+  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>,<vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>,<vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>,<vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)


        

