[llvm] 411bfe4 - [ARM] Add and update a lot of VLDn tests. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 19 10:55:25 PST 2019


Author: David Green
Date: 2019-11-19T18:37:30Z
New Revision: 411bfe476b758c09a0c9d4b3176e46f0a70de3bb

URL: https://github.com/llvm/llvm-project/commit/411bfe476b758c09a0c9d4b3176e46f0a70de3bb
DIFF: https://github.com/llvm/llvm-project/commit/411bfe476b758c09a0c9d4b3176e46f0a70de3bb.diff

LOG: [ARM] Add and update a lot of VLDn tests. NFC

Added: 
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld3.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll

Modified: 
    llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
new file mode 100644
index 000000000000..28a7c75421f9
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -0,0 +1,990 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
+
+; i32
+
+define void @vld2_v2i32(<4 x i32> *%src, <2 x i32> *%dst) {
+; CHECK-LABEL: vld2_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    strd r2, r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <4 x i32>, <4 x i32>* %src, align 4
+  %s1 = shufflevector <4 x i32> %l1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  %s2 = shufflevector <4 x i32> %l1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+  %a = add <2 x i32> %s1, %s2
+  store <2 x i32> %a, <2 x i32> *%dst
+  ret void
+}
+
+define void @vld2_v4i32(<8 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: vld2_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vadd.i32 q0, q1, q2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x i32>, <8 x i32>* %src, align 4
+  %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x i32> %l1, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = add <4 x i32> %s1, %s2
+  store <4 x i32> %a, <4 x i32> *%dst
+  ret void
+}
+
+define void @vld2_v8i32(<16 x i32> *%src, <8 x i32> *%dst) {
+; CHECK-LABEL: vld2_v8i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vadd.i32 q0, q0, q3
+; CHECK-NEXT:    vmov.f32 s12, s9
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s10, s4
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vmov.f32 s11, s6
+; CHECK-NEXT:    vadd.i32 q1, q2, q3
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i32>, <16 x i32>* %src, align 4
+  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %a = add <8 x i32> %s1, %s2
+  store <8 x i32> %a, <8 x i32> *%dst
+  ret void
+}
+
+define void @vld2_v16i32(<32 x i32> *%src, <16 x i32> *%dst) {
+; CHECK-LABEL: vld2_v16i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vadd.i32 q0, q1, q2
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s12, s9
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s10, s4
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vmov.f32 s11, s6
+; CHECK-NEXT:    vadd.i32 q1, q2, q3
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s20, s9
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s10, s12
+; CHECK-NEXT:    vmov.f32 s23, s15
+; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vadd.i32 q2, q2, q5
+; CHECK-NEXT:    vmov.f32 s20, s17
+; CHECK-NEXT:    vmov.f32 s21, s19
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s17, s18
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s18, s12
+; CHECK-NEXT:    vmov.f32 s23, s15
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vadd.i32 q3, q4, q5
+; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x i32>, <32 x i32>* %src, align 4
+  %s1 = shufflevector <32 x i32> %l1, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %s2 = shufflevector <32 x i32> %l1, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %a = add <16 x i32> %s1, %s2
+  store <16 x i32> %a, <16 x i32> *%dst
+  ret void
+}
+
+; i16
+
+define void @vld2_v2i16(<4 x i16> *%src, <2 x i16> *%dst) {
+; CHECK-LABEL: vld2_v2i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strh r0, [r1, #2]
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <4 x i16>, <4 x i16>* %src, align 4
+  %s1 = shufflevector <4 x i16> %l1, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+  %s2 = shufflevector <4 x i16> %l1, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+  %a = add <2 x i16> %s1, %s2
+  store <2 x i16> %a, <2 x i16> *%dst
+  ret void
+}
+
+define void @vld2_v4i16(<8 x i16> *%src, <4 x i16> *%dst) {
+; CHECK-LABEL: vld2_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vstrh.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x i16>, <8 x i16>* %src, align 4
+  %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = add <4 x i16> %s1, %s2
+  store <4 x i16> %a, <4 x i16> *%dst
+  ret void
+}
+
+define void @vld2_v8i16(<16 x i16> *%src, <8 x i16> *%dst) {
+; CHECK-LABEL: vld2_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vadd.i16 q0, q3, q0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i16>, <16 x i16>* %src, align 4
+  %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %a = add <8 x i16> %s1, %s2
+  store <8 x i16> %a, <8 x i16> *%dst
+  ret void
+}
+
+define void @vld2_v16i16(<32 x i16> *%src, <16 x i16> *%dst) {
+; CHECK-LABEL: vld2_v16i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q3[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q3[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q3[7], r2
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vadd.i16 q0, q3, q0
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[4]
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vadd.i16 q1, q1, q4
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x i16>, <32 x i16>* %src, align 4
+  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %a = add <16 x i16> %s1, %s2
+  store <16 x i16> %a, <16 x i16> *%dst
+  ret void
+}
+
+; i8
+
+define void @vld2_v2i8(<4 x i8> *%src, <2 x i8> *%dst) {
+; CHECK-LABEL: vld2_v2i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u32 q0, [r0]
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strb r0, [r1, #1]
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strb r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <4 x i8>, <4 x i8>* %src, align 4
+  %s1 = shufflevector <4 x i8> %l1, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+  %s2 = shufflevector <4 x i8> %l1, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+  %a = add <2 x i8> %s1, %s2
+  store <2 x i8> %a, <2 x i8> *%dst
+  ret void
+}
+
+define void @vld2_v4i8(<8 x i8> *%src, <4 x i8> *%dst) {
+; CHECK-LABEL: vld2_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vstrb.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x i8>, <8 x i8>* %src, align 4
+  %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = add <4 x i8> %s1, %s2
+  store <4 x i8> %a, <4 x i8> *%dst
+  ret void
+}
+
+define void @vld2_v8i8(<16 x i8> *%src, <8 x i8> *%dst) {
+; CHECK-LABEL: vld2_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vstrb.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i8>, <16 x i8>* %src, align 4
+  %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %a = add <8 x i8> %s1, %s2
+  store <8 x i8> %a, <8 x i8> *%dst
+  ret void
+}
+
+define void @vld2_v16i8(<32 x i8> *%src, <16 x i8> *%dst) {
+; CHECK-LABEL: vld2_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u8 r2, q1[1]
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q0[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    vmov.8 q0[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    vmov.8 q0[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.8 q0[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[9]
+; CHECK-NEXT:    vmov.8 q0[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.8 q0[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[13]
+; CHECK-NEXT:    vmov.8 q0[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    vmov.8 q0[7], r2
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.8 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.8 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.8 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.8 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-NEXT:    vmov.8 q3[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q3[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.8 q3[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-NEXT:    vmov.8 q3[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.8 q3[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.8 q3[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.8 q3[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.8 q3[15], r0
+; CHECK-NEXT:    vadd.i8 q0, q3, q0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x i8>, <32 x i8>* %src, align 4
+  %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %a = add <16 x i8> %s1, %s2
+  store <16 x i8> %a, <16 x i8> *%dst
+  ret void
+}
+
+; i64
+
+define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
+; CHECK-LABEL: vld2_v2i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r12, s7
+; CHECK-NEXT:    adds.w lr, r0, r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %l1 = load <4 x i64>, <4 x i64>* %src, align 4
+  %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+  %s2 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+  %a = add <2 x i64> %s1, %s2
+  store <2 x i64> %a, <2 x i64> *%dst
+  ret void
+}
+
+define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
+; CHECK-LABEL: vld2_v4i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d8, d7
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vmov.f32 s14, s20
+; CHECK-NEXT:    vmov.f32 s15, s21
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov r12, s19
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    adds.w lr, r0, r3
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.32 q3[2], lr
+; CHECK-NEXT:    vmov.32 q3[3], r12
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    adc.w r12, r2, r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %l1 = load <8 x i64>, <8 x i64>* %src, align 4
+  %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = add <4 x i64> %s1, %s2
+  store <4 x i64> %a, <4 x i64> *%dst
+  ret void
+}
+
+; f32
+
+define void @vld2_v2f32(<4 x float> *%src, <2 x float> *%dst) {
+; CHECK-LABEL: vld2_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vstmia r1, {s0, s1}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <4 x float>, <4 x float>* %src, align 4
+  %s1 = shufflevector <4 x float> %l1, <4 x float> undef, <2 x i32> <i32 0, i32 2>
+  %s2 = shufflevector <4 x float> %l1, <4 x float> undef, <2 x i32> <i32 1, i32 3>
+  %a = fadd <2 x float> %s1, %s2
+  store <2 x float> %a, <2 x float> *%dst
+  ret void
+}
+
+define void @vld2_v4f32(<8 x float> *%src, <4 x float> *%dst) {
+; CHECK-LABEL: vld2_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vadd.f32 q0, q1, q2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x float>, <8 x float>* %src, align 4
+  %s1 = shufflevector <8 x float> %l1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x float> %l1, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = fadd <4 x float> %s1, %s2
+  store <4 x float> %a, <4 x float> *%dst
+  ret void
+}
+
+define void @vld2_v8f32(<16 x float> *%src, <8 x float> *%dst) {
+; CHECK-LABEL: vld2_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vadd.f32 q0, q0, q3
+; CHECK-NEXT:    vmov.f32 s12, s9
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s10, s4
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vmov.f32 s11, s6
+; CHECK-NEXT:    vadd.f32 q1, q2, q3
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x float>, <16 x float>* %src, align 4
+  %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %a = fadd <8 x float> %s1, %s2
+  store <8 x float> %a, <8 x float> *%dst
+  ret void
+}
+
+define void @vld2_v16f32(<32 x float> *%src, <16 x float> *%dst) {
+; CHECK-LABEL: vld2_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vadd.f32 q0, q1, q2
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s12, s9
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s10, s4
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vmov.f32 s11, s6
+; CHECK-NEXT:    vadd.f32 q1, q2, q3
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s20, s9
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s10, s12
+; CHECK-NEXT:    vmov.f32 s23, s15
+; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vadd.f32 q2, q2, q5
+; CHECK-NEXT:    vmov.f32 s20, s17
+; CHECK-NEXT:    vmov.f32 s21, s19
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s17, s18
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s18, s12
+; CHECK-NEXT:    vmov.f32 s23, s15
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vadd.f32 q3, q4, q5
+; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x float>, <32 x float>* %src, align 4
+  %s1 = shufflevector <32 x float> %l1, <32 x float> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %s2 = shufflevector <32 x float> %l1, <32 x float> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %a = fadd <16 x float> %s1, %s2
+  store <16 x float> %a, <16 x float> *%dst
+  ret void
+}
+
+; f16
+
+define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) {
+; CHECK-LABEL: vld2_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrd r2, r0, [r0]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <4 x half>, <4 x half>* %src, align 4
+  %s1 = shufflevector <4 x half> %l1, <4 x half> undef, <2 x i32> <i32 0, i32 2>
+  %s2 = shufflevector <4 x half> %l1, <4 x half> undef, <2 x i32> <i32 1, i32 3>
+  %a = fadd <2 x half> %s1, %s2
+  store <2 x half> %a, <2 x half> *%dst
+  ret void
+}
+
+define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
+; CHECK-LABEL: vld2_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s12, s2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vadd.f16 q0, q1, q2
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    strd r0, r2, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x half>, <8 x half>* %src, align 4
+  %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = fadd <4 x half> %s1, %s2
+  store <4 x half> %a, <4 x half> *%dst
+  ret void
+}
+
+define void @vld2_v8f16(<16 x half> *%src, <8 x half> *%dst) {
+; CHECK-LABEL: vld2_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    vmov.16 q0[1], r3
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmovx.f16 s16, s10
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x half>, <16 x half>* %src, align 4
+  %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %a = fadd <8 x half> %s1, %s2
+  store <8 x half> %a, <8 x half> *%dst
+  ret void
+}
+
+define void @vld2_v16f16(<32 x half> *%src, <16 x half> *%dst) {
+; CHECK-LABEL: vld2_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    vmov.16 q0[1], r3
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmovx.f16 s16, s10
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.16 q3[1], r3
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vmov.16 q3[5], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vmov.16 q3[6], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.16 q3[7], r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vadd.f16 q1, q0, q3
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s4, s8
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s9
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmovx.f16 s12, s10
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s1
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s2
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vadd.f16 q0, q3, q1
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x half>, <32 x half>* %src, align 4
+  %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %a = fadd <16 x half> %s1, %s2
+  store <16 x half> %a, <16 x half> *%dst
+  ret void
+}
+
+; f64
+
+define void @vld2_v2f64(<4 x double> *%src, <2 x double> *%dst) {
+; CHECK-LABEL: vld2_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vadd.f64 d1, d0, d1
+; CHECK-NEXT:    vadd.f64 d0, d2, d3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <4 x double>, <4 x double>* %src, align 4
+  %s1 = shufflevector <4 x double> %l1, <4 x double> undef, <2 x i32> <i32 0, i32 2>
+  %s2 = shufflevector <4 x double> %l1, <4 x double> undef, <2 x i32> <i32 1, i32 3>
+  %a = fadd <2 x double> %s1, %s2
+  store <2 x double> %a, <2 x double> *%dst
+  ret void
+}
+
+define void @vld2_v4f64(<8 x double> *%src, <4 x double> *%dst) {
+; CHECK-LABEL: vld2_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f64 d1, d0, d1
+; CHECK-NEXT:    vadd.f64 d0, d2, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vadd.f64 d3, d2, d3
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vadd.f64 d2, d4, d5
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x double>, <8 x double>* %src, align 4
+  %s1 = shufflevector <8 x double> %l1, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = fadd <4 x double> %s1, %s2
+  store <4 x double> %a, <4 x double> *%dst
+  ret void
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
new file mode 100644
index 000000000000..f23f73fd3cd9
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -0,0 +1,1843 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
+
+; i32
+
+define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
+; CHECK-LABEL: vld3_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrd r2, r3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.f64 d0, d2
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s14, s8
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    strd r2, r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <6 x i32>, <6 x i32>* %src, align 4
+  %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+  %s2 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+  %s3 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+  %a1 = add <2 x i32> %s1, %s2
+  %a = add <2 x i32> %a1, %s3
+  store <2 x i32> %a, <2 x i32> *%dst
+  ret void
+}
+
+define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: vld3_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.32 r0, q2[2]
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.32 r0, q2[1]
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vadd.i32 q3, q4, q3
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vadd.i32 q0, q3, q0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <12 x i32>, <12 x i32>* %src, align 4
+  %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %s3 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %a1 = add <4 x i32> %s1, %s2
+  %a = add <4 x i32> %a1, %s3
+  store <4 x i32> %a, <4 x i32> *%dst
+  ret void
+}
+
+define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
+; CHECK-LABEL: vld3_v8i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.32 r2, q2[2]
+; CHECK-NEXT:    vdup.32 q4, r2
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.32 r2, q2[1]
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vadd.i32 q3, q4, q3
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.i32 q0, q3, q0
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.32 r0, q3[2]
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.32 r0, q3[1]
+; CHECK-NEXT:    vmov.f64 d10, d4
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s14, s12
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vadd.i32 q1, q4, q1
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <24 x i32>, <24 x i32>* %src, align 4
+  %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %s2 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %s3 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %a1 = add <8 x i32> %s1, %s2
+  %a = add <8 x i32> %a1, %s3
+  store <8 x i32> %a, <8 x i32> *%dst
+  ret void
+}
+
+define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
+; CHECK-LABEL: vld3_v16i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.32 r2, q2[2]
+; CHECK-NEXT:    vdup.32 q4, r2
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.32 r2, q2[1]
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vadd.i32 q3, q4, q3
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.i32 q0, q3, q0
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.32 r2, q3[2]
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.32 r2, q3[1]
+; CHECK-NEXT:    vmov.f64 d10, d4
+; CHECK-NEXT:    vdup.32 q6, r2
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s14, s12
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT:    vadd.i32 q1, q4, q1
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT:    vmov.f32 s20, s13
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.32 r2, q4[2]
+; CHECK-NEXT:    vdup.32 q6, r2
+; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.32 r2, q4[1]
+; CHECK-NEXT:    vmov.f64 d12, d6
+; CHECK-NEXT:    vdup.32 q7, r2
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
+; CHECK-NEXT:    vmov.f32 s25, s15
+; CHECK-NEXT:    vmov.f32 s26, s10
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vmov.f32 s27, s31
+; CHECK-NEXT:    vmov.f32 s18, s16
+; CHECK-NEXT:    vadd.i32 q5, q6, q5
+; CHECK-NEXT:    vmov.f32 s10, s16
+; CHECK-NEXT:    vmov.f32 s11, s19
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT:    vadd.i32 q2, q5, q2
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s24, s17
+; CHECK-NEXT:    vmov.32 r0, q0[2]
+; CHECK-NEXT:    vmov.f32 s25, s20
+; CHECK-NEXT:    vdup.32 q7, r0
+; CHECK-NEXT:    vmov.f64 d6, d8
+; CHECK-NEXT:    vmov.32 r0, q0[1]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s26, s23
+; CHECK-NEXT:    vmov.f32 s13, s19
+; CHECK-NEXT:    vmov.f32 s27, s31
+; CHECK-NEXT:    vdup.32 q7, r0
+; CHECK-NEXT:    vmov.f32 s14, s22
+; CHECK-NEXT:    vmov.f32 s20, s18
+; CHECK-NEXT:    vmov.f32 s15, s31
+; CHECK-NEXT:    vmov.f32 s2, s0
+; CHECK-NEXT:    vadd.i32 q6, q3, q6
+; CHECK-NEXT:    vmov.f32 s22, s0
+; CHECK-NEXT:    vmov.f32 s23, s3
+; CHECK-NEXT:    vadd.i32 q0, q6, q5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <48 x i32>, <48 x i32>* %src, align 4
+  %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %s2 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %s3 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  %a1 = add <16 x i32> %s1, %s2
+  %a = add <16 x i32> %a1, %s3
+  store <16 x i32> %a, <16 x i32> *%dst
+  ret void
+}
+
+; i16
+
+define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) {
+; CHECK-LABEL: vld3_v2i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    ldr r2, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q1, [r0]
+; CHECK-NEXT:    mov r3, sp
+; CHECK-NEXT:    str r2, [sp]
+; CHECK-NEXT:    vldrh.u32 q2, [r3]
+; CHECK-NEXT:    vmov.f64 d0, d2
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmov.f32 s14, s8
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strh r0, [r1, #2]
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <6 x i16>, <6 x i16>* %src, align 4
+  %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
+  %s2 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
+  %s3 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
+  %a1 = add <2 x i16> %s1, %s2
+  %a = add <2 x i16> %a1, %s3
+  store <2 x i16> %a, <2 x i16> *%dst
+  ret void
+}
+
+define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
+; CHECK-LABEL: vld3_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vadd.i32 q0, q1, q2
+; CHECK-NEXT:    vstrh.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <12 x i16>, <12 x i16>* %src, align 4
+  %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %s3 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %a1 = add <4 x i16> %s1, %s2
+  %a = add <4 x i16> %a1, %s3
+  store <4 x i16> %a, <4 x i16> *%dst
+  ret void
+}
+
+define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
+; CHECK-LABEL: vld3_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q2[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.16 q3[5], r2
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[1]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[2]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[3]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[6]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vadd.i16 q3, q3, q5
+; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[1]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vadd.i16 q0, q3, q0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <24 x i16>, <24 x i16>* %src, align 4
+  %s1 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %s2 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %s3 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %a1 = add <8 x i16> %s1, %s2
+  %a = add <8 x i16> %a1, %s3
+  store <8 x i16> %a, <8 x i16> *%dst
+  ret void
+}
+
+define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
+; CHECK-LABEL: vld3_v16i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q2[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.16 q3[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[6]
+; CHECK-NEXT:    vmov.16 q3[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q6[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.16 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[0]
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q6[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[6]
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[7]
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[7]
+; CHECK-NEXT:    vmov.16 q3[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vadd.i16 q3, q3, q5
+; CHECK-NEXT:    vmov.u16 r2, q4[0]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[1]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[2]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[3]
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[4]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vadd.i16 q0, q3, q0
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[5]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vadd.i16 q4, q6, q4
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vadd.i16 q1, q4, q2
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <48 x i16>, <48 x i16>* %src, align 4
+  %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %s2 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %s3 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  %a1 = add <16 x i16> %s1, %s2
+  %a = add <16 x i16> %a1, %s3
+  store <16 x i16> %a, <16 x i16> *%dst
+  ret void
+}
+
+; i8
+
+define void @vld3_v2i8(<6 x i8> *%src, <2 x i8> *%dst) {
+; CHECK-LABEL: vld3_v2i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    ldrd r2, r0, [r0]
+; CHECK-NEXT:    strd r2, r0, [sp]
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strb r0, [r1, #1]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strb r0, [r1]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <6 x i8>, <6 x i8>* %src, align 4
+  %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
+  %s2 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
+  %s3 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
+  %a1 = add <2 x i8> %s1, %s2
+  %a = add <2 x i8> %a1, %s3
+  store <2 x i8> %a, <2 x i8> *%dst
+  ret void
+}
+
+define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) {
+; CHECK-LABEL: vld3_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vldrb.u16 q2, [r0]
+; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    str r3, [sp]
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vldrb.u16 q2, [r2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vadd.i32 q1, q1, q3
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vstrb.32 q0, [r1]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <12 x i8>, <12 x i8>* %src, align 4
+  %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %s3 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %a1 = add <4 x i8> %s1, %s2
+  %a = add <4 x i8> %a1, %s3
+  store <4 x i8> %a, <4 x i8> *%dst
+  ret void
+}
+
+define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
+; CHECK-LABEL: vld3_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.16 q2[5], r2
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vadd.i16 q2, q2, q3
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vadd.i16 q0, q2, q3
+; CHECK-NEXT:    vstrb.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <24 x i8>, <24 x i8>* %src, align 4
+  %s1 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %s2 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %s3 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %a1 = add <8 x i8> %s1, %s2
+  %a = add <8 x i8> %a1, %s3
+  store <8 x i8> %a, <8 x i8> *%dst
+  ret void
+}
+
+define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
+; CHECK-LABEL: vld3_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.8 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.8 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov.8 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    vmov.8 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    vmov.8 q2[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.8 q2[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[2]
+; CHECK-NEXT:    vmov.8 q2[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    vmov.8 q2[7], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[8]
+; CHECK-NEXT:    vmov.8 q2[8], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.8 q2[9], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[14]
+; CHECK-NEXT:    vmov.8 q2[10], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[0]
+; CHECK-NEXT:    vmov.8 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[1]
+; CHECK-NEXT:    vmov.8 q3[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[2]
+; CHECK-NEXT:    vmov.8 q3[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[3]
+; CHECK-NEXT:    vmov.8 q3[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[4]
+; CHECK-NEXT:    vmov.8 q3[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[5]
+; CHECK-NEXT:    vmov.8 q3[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[6]
+; CHECK-NEXT:    vmov.8 q3[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[7]
+; CHECK-NEXT:    vmov.8 q3[7], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[8]
+; CHECK-NEXT:    vmov.8 q3[8], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[9]
+; CHECK-NEXT:    vmov.8 q3[9], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[10]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.8 q3[10], r2
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.8 q4[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov.8 q4[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.8 q4[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.8 q4[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[11]
+; CHECK-NEXT:    vmov.8 q3[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[12]
+; CHECK-NEXT:    vmov.8 q3[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[13]
+; CHECK-NEXT:    vmov.8 q3[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[14]
+; CHECK-NEXT:    vmov.8 q3[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.8 q6[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.8 q6[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.8 q6[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.8 q6[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.8 q6[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q6[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.8 q6[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.8 q6[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.8 q6[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q6[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.8 q6[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[0]
+; CHECK-NEXT:    vmov.8 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[1]
+; CHECK-NEXT:    vmov.8 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[2]
+; CHECK-NEXT:    vmov.8 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[3]
+; CHECK-NEXT:    vmov.8 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[4]
+; CHECK-NEXT:    vmov.8 q5[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.8 q5[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[6]
+; CHECK-NEXT:    vmov.8 q5[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[7]
+; CHECK-NEXT:    vmov.8 q5[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[8]
+; CHECK-NEXT:    vmov.8 q5[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[9]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[10]
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q6[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-NEXT:    vmov.8 q6[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.8 q6[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.8 q6[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.8 q6[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[12]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[13]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[15]
+; CHECK-NEXT:    vmov.8 q3[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.8 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.8 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.8 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.8 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.8 q4[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.8 q4[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.8 q4[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.8 q4[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vadd.i8 q3, q3, q5
+; CHECK-NEXT:    vmov.u8 r0, q4[0]
+; CHECK-NEXT:    vmov.8 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[1]
+; CHECK-NEXT:    vmov.8 q0[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[2]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[3]
+; CHECK-NEXT:    vmov.8 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[4]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[5]
+; CHECK-NEXT:    vmov.8 q0[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[6]
+; CHECK-NEXT:    vmov.8 q0[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[7]
+; CHECK-NEXT:    vmov.8 q0[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[8]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[9]
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-NEXT:    vmov.8 q1[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q1[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-NEXT:    vmov.8 q1[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.8 q1[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.8 q1[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.8 q1[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vadd.i8 q0, q3, q0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <48 x i8>, <48 x i8>* %src, align 4
+  %s1 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %s2 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %s3 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  %a1 = add <16 x i8> %s1, %s2
+  %a = add <16 x i8> %a1, %s3
+  store <16 x i8> %a, <16 x i8> *%dst
+  ret void
+}
+
+; i64
+
+define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
+; CHECK-LABEL: vld3_v2i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s14, s16
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov.f32 s15, s17
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vmov r12, s15
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    adds.w lr, r0, r3
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w lr, lr, r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    adc.w r12, r2, r3
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %l1 = load <6 x i64>, <6 x i64>* %src, align 4
+  %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
+  %s2 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
+  %s3 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
+  %a1 = add <2 x i64> %s1, %s2
+  %a = add <2 x i64> %a1, %s3
+  store <2 x i64> %a, <2 x i64> *%dst
+  ret void
+}
+
+define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
+; CHECK-LABEL: vld3_v4i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f64 d4, d0
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s9, s1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vmov.f64 d14, d11
+; CHECK-NEXT:    vmov.f32 s29, s23
+; CHECK-NEXT:    vmov.f32 s30, s0
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s31, s1
+; CHECK-NEXT:    vmov r3, s30
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s10, s18
+; CHECK-NEXT:    vmov.f32 s14, s16
+; CHECK-NEXT:    vmov.f32 s11, s19
+; CHECK-NEXT:    vmov.f32 s15, s17
+; CHECK-NEXT:    vmov.f64 d8, d12
+; CHECK-NEXT:    vmov.f32 s17, s25
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vmov r12, s31
+; CHECK-NEXT:    vmov r2, s23
+; CHECK-NEXT:    adds.w lr, r0, r3
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w lr, lr, r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    adc.w r12, r2, r3
+; CHECK-NEXT:    vmov r3, s29
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    adcs r0, r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    adds.w lr, r3, r4
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r4, s5
+; CHECK-NEXT:    adc.w r12, r0, r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %l1 = load <12 x i64>, <12 x i64>* %src, align 4
+  %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %s3 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %a1 = add <4 x i64> %s1, %s2
+  %a = add <4 x i64> %a1, %s3
+  store <4 x i64> %a, <4 x i64> *%dst
+  ret void
+}
+
+; f32
+
+define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) {
+; CHECK-LABEL: vld3_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldr s1, [r0, #16]
+; CHECK-NEXT:    vldr s5, [r0, #20]
+; CHECK-NEXT:    vmov.f64 d6, d4
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vadd.f32 q0, q3, q0
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vstmia r1, {s0, s1}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <6 x float>, <6 x float>* %src, align 4
+  %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3>
+  %s2 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 1, i32 4>
+  %s3 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 2, i32 5>
+  %a1 = fadd <2 x float> %s1, %s2
+  %a = fadd <2 x float> %a1, %s3
+  store <2 x float> %a, <2 x float> *%dst
+  ret void
+}
+
+define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
+; CHECK-LABEL: vld3_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.32 r0, q2[2]
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.32 r0, q2[1]
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vadd.f32 q3, q4, q3
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vadd.f32 q0, q3, q0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <12 x float>, <12 x float>* %src, align 4
+  %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %s3 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %a1 = fadd <4 x float> %s1, %s2
+  %a = fadd <4 x float> %a1, %s3
+  store <4 x float> %a, <4 x float> *%dst
+  ret void
+}
+
+define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
+; CHECK-LABEL: vld3_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.32 r2, q2[2]
+; CHECK-NEXT:    vdup.32 q4, r2
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.32 r2, q2[1]
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vadd.f32 q3, q4, q3
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f32 q0, q3, q0
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.32 r0, q3[2]
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.32 r0, q3[1]
+; CHECK-NEXT:    vmov.f64 d10, d4
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s14, s12
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <24 x float>, <24 x float>* %src, align 4
+  %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %s2 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %s3 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %a1 = fadd <8 x float> %s1, %s2
+  %a = fadd <8 x float> %a1, %s3
+  store <8 x float> %a, <8 x float> *%dst
+  ret void
+}
+
+define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
+; CHECK-LABEL: vld3_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.32 r2, q2[2]
+; CHECK-NEXT:    vdup.32 q4, r2
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.32 r2, q2[1]
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vadd.f32 q3, q4, q3
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f32 q0, q3, q0
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.32 r2, q3[2]
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.32 r2, q3[1]
+; CHECK-NEXT:    vmov.f64 d10, d4
+; CHECK-NEXT:    vdup.32 q6, r2
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s14, s12
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT:    vmov.f32 s20, s13
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.32 r2, q4[2]
+; CHECK-NEXT:    vdup.32 q6, r2
+; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.32 r2, q4[1]
+; CHECK-NEXT:    vmov.f64 d12, d6
+; CHECK-NEXT:    vdup.32 q7, r2
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
+; CHECK-NEXT:    vmov.f32 s25, s15
+; CHECK-NEXT:    vmov.f32 s26, s10
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vmov.f32 s27, s31
+; CHECK-NEXT:    vmov.f32 s18, s16
+; CHECK-NEXT:    vadd.f32 q5, q6, q5
+; CHECK-NEXT:    vmov.f32 s10, s16
+; CHECK-NEXT:    vmov.f32 s11, s19
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT:    vadd.f32 q2, q5, q2
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s24, s17
+; CHECK-NEXT:    vmov.32 r0, q0[2]
+; CHECK-NEXT:    vmov.f32 s25, s20
+; CHECK-NEXT:    vdup.32 q7, r0
+; CHECK-NEXT:    vmov.f64 d6, d8
+; CHECK-NEXT:    vmov.32 r0, q0[1]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s26, s23
+; CHECK-NEXT:    vmov.f32 s13, s19
+; CHECK-NEXT:    vmov.f32 s27, s31
+; CHECK-NEXT:    vdup.32 q7, r0
+; CHECK-NEXT:    vmov.f32 s14, s22
+; CHECK-NEXT:    vmov.f32 s20, s18
+; CHECK-NEXT:    vmov.f32 s15, s31
+; CHECK-NEXT:    vmov.f32 s2, s0
+; CHECK-NEXT:    vadd.f32 q6, q3, q6
+; CHECK-NEXT:    vmov.f32 s22, s0
+; CHECK-NEXT:    vmov.f32 s23, s3
+; CHECK-NEXT:    vadd.f32 q0, q6, q5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <48 x float>, <48 x float>* %src, align 4
+  %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %s2 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %s3 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  %a1 = fadd <16 x float> %s1, %s2
+  %a = fadd <16 x float> %a1, %s3
+  store <16 x float> %a, <16 x float> *%dst
+  ret void
+}
+
+; f16
+
+define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
+; CHECK-LABEL: vld3_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrd r2, r3, [r0]
+; CHECK-NEXT:    ldr r0, [r0, #8]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vadd.f16 q1, q2, q1
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <6 x half>, <6 x half>* %src, align 4
+  %s1 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 0, i32 3>
+  %s2 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 1, i32 4>
+  %s3 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 2, i32 5>
+  %a1 = fadd <2 x half> %s1, %s2
+  %a = fadd <2 x half> %a1, %s3
+  store <2 x half> %a, <2 x half> *%dst
+  ret void
+}
+
+define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
+; CHECK-LABEL: vld3_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vmov.16 q2[0], r3
+; CHECK-NEXT:    vmovx.f16 s12, s1
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmovx.f16 s0, s2
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmovx.f16 s16, s4
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vadd.f16 q2, q3, q2
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vadd.f16 q0, q2, q0
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    strd r0, r2, [r1]
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <12 x half>, <12 x half>* %src, align 4
+  %s1 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %s3 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %a1 = fadd <4 x half> %s1, %s2
+  %a = fadd <4 x half> %a1, %s3
+  store <4 x half> %a, <4 x half> *%dst
+  ret void
+}
+
+define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
+; CHECK-LABEL: vld3_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10}
+; CHECK-NEXT:    vpush {d8, d9, d10}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vmov.16 q3[0], r3
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmovx.f16 s0, s6
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmovx.f16 s16, s9
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s2
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmovx.f16 s8, s10
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vadd.f16 q3, q4, q3
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s0, s3
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vadd.f16 q0, q3, q2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <24 x half>, <24 x half>* %src, align 4
+  %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %s2 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %s3 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %a1 = fadd <8 x half> %s1, %s2
+  %a = fadd <8 x half> %a1, %s3
+  store <8 x half> %a, <8 x half> *%dst
+  ret void
+}
+
+define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
+; CHECK-LABEL: vld3_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10}
+; CHECK-NEXT:    vpush {d8, d9, d10}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov.16 q3[1], r3
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.16 q3[5], r2
+; CHECK-NEXT:    vmovx.f16 s16, s4
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vmov.16 q3[6], r2
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov.16 q4[1], r3
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmovx.f16 s20, s2
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmovx.f16 s20, s9
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmovx.f16 s20, s10
+; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov.16 q3[7], r2
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vadd.f16 q3, q3, q4
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov.16 q1[0], r3
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f16 q3, q3, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmovx.f16 s16, s4
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s7
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmovx.f16 s20, s6
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s1
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmovx.f16 s20, s2
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vadd.f16 q3, q3, q4
+; CHECK-NEXT:    vmovx.f16 s16, s10
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s0, s3
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vadd.f16 q0, q3, q2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <48 x half>, <48 x half>* %src, align 4
+  %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %s3 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  %a1 = fadd <16 x half> %s1, %s2
+  %a = fadd <16 x half> %a1, %s3
+  store <16 x half> %a, <16 x half> *%dst
+  ret void
+}
+
+; f64
+
+define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) {
+; CHECK-LABEL: vld3_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vadd.f64 d4, d3, d0
+; CHECK-NEXT:    vadd.f64 d5, d6, d7
+; CHECK-NEXT:    vadd.f64 d1, d4, d1
+; CHECK-NEXT:    vadd.f64 d0, d5, d2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <6 x double>, <6 x double>* %src, align 4
+  %s1 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 0, i32 3>
+  %s2 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 1, i32 4>
+  %s3 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 2, i32 5>
+  %a1 = fadd <2 x double> %s1, %s2
+  %a = fadd <2 x double> %a1, %s3
+  store <2 x double> %a, <2 x double> *%dst
+  ret void
+}
+
+define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) {
+; CHECK-LABEL: vld3_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT:    vadd.f64 d5, d6, d7
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q6, [r0]
+; CHECK-NEXT:    vadd.f64 d4, d1, d2
+; CHECK-NEXT:    vadd.f64 d10, d9, d6
+; CHECK-NEXT:    vadd.f64 d11, d12, d13
+; CHECK-NEXT:    vadd.f64 d3, d4, d3
+; CHECK-NEXT:    vadd.f64 d2, d5, d0
+; CHECK-NEXT:    vadd.f64 d1, d10, d7
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vadd.f64 d0, d11, d8
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <12 x double>, <12 x double>* %src, align 4
+  %s1 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %s3 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %a1 = fadd <4 x double> %s1, %s2
+  %a = fadd <4 x double> %a1, %s3
+  store <4 x double> %a, <4 x double> *%dst
+  ret void
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
new file mode 100644
index 000000000000..1ff07600fe77
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -0,0 +1,2230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
+
+; i32
+
+define void @vld4_v2i32(<8 x i32> *%src, <2 x i32> *%dst) {
+; CHECK-LABEL: vld4_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vmov.f32 s16, s1
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s18, s9
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    add.w r12, r2, r0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    add r0, r3
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strd r0, r12, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x i32>, <8 x i32>* %src, align 4
+  %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+  %a1 = add <2 x i32> %s1, %s2
+  %a2 = add <2 x i32> %s3, %s4
+  %a3 = add <2 x i32> %a1, %a2
+  store <2 x i32> %a3, <2 x i32> *%dst
+  ret void
+}
+
+define void @vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: vld4_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vadd.i32 q0, q0, q5
+; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i32>, <16 x i32>* %src, align 4
+  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = add <4 x i32> %s1, %s2
+  %a2 = add <4 x i32> %s3, %s4
+  %a3 = add <4 x i32> %a1, %a2
+  store <4 x i32> %a3, <4 x i32> *%dst
+  ret void
+}
+
+define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) {
+; CHECK-LABEL: vld4_v8i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vadd.i32 q0, q0, q5
+; CHECK-NEXT:    vmov.f64 d12, d3
+; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s22, s19
+; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vmov.f32 s25, s14
+; CHECK-NEXT:    vmov.f32 s20, s7
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vmov.f32 s26, s18
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmov.f32 s27, s10
+; CHECK-NEXT:    vadd.i32 q5, q6, q5
+; CHECK-NEXT:    vmov.f32 s26, s17
+; CHECK-NEXT:    vmov.f32 s27, s9
+; CHECK-NEXT:    vmov.f32 s24, s5
+; CHECK-NEXT:    vmov.f32 s6, s16
+; CHECK-NEXT:    vmov.f32 s7, s8
+; CHECK-NEXT:    vmov.f32 s25, s13
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vadd.i32 q1, q1, q6
+; CHECK-NEXT:    vadd.i32 q1, q1, q5
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x i32>, <32 x i32>* %src, align 4
+  %s1 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %s2 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %s3 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %s4 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %a1 = add <8 x i32> %s1, %s2
+  %a2 = add <8 x i32> %s3, %s4
+  %a3 = add <8 x i32> %a1, %a2
+  store <8 x i32> %a3, <8 x i32> *%dst
+  ret void
+}
+
+define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) {
+; CHECK-LABEL: vld4_v16i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #112]
+; CHECK-NEXT:    vadd.i32 q0, q0, q5
+; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d0, d3
+; CHECK-NEXT:    vmov.f32 s22, s19
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vmov.f32 s26, s17
+; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vmov.f32 s27, s9
+; CHECK-NEXT:    vmov.f32 s20, s7
+; CHECK-NEXT:    vmov.f32 s24, s5
+; CHECK-NEXT:    vmov.f32 s1, s14
+; CHECK-NEXT:    vmov.f32 s6, s16
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #144]
+; CHECK-NEXT:    vmov.f32 s7, s8
+; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmov.f32 s25, s13
+; CHECK-NEXT:    vadd.i32 q5, q0, q5
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #176]
+; CHECK-NEXT:    vadd.i32 q0, q1, q6
+; CHECK-NEXT:    vadd.i32 q1, q0, q5
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #160]
+; CHECK-NEXT:    vmov.f64 d0, d5
+; CHECK-NEXT:    vmov.f32 s26, s23
+; CHECK-NEXT:    vmov.f32 s23, s14
+; CHECK-NEXT:    vmov.f32 s30, s21
+; CHECK-NEXT:    vmov.f32 s27, s15
+; CHECK-NEXT:    vmov.f32 s31, s13
+; CHECK-NEXT:    vmov.f32 s24, s11
+; CHECK-NEXT:    vmov.f32 s28, s9
+; CHECK-NEXT:    vmov.f32 s1, s18
+; CHECK-NEXT:    vmov.f32 s10, s20
+; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vmov.f32 s11, s12
+; CHECK-NEXT:    vmov.f32 s3, s14
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vmov.f32 s25, s19
+; CHECK-NEXT:    vmov.f32 s29, s17
+; CHECK-NEXT:    vadd.i32 q6, q0, q6
+; CHECK-NEXT:    vmov.f32 s9, s16
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
+; CHECK-NEXT:    vadd.i32 q0, q2, q7
+; CHECK-NEXT:    vmov.f64 d10, d7
+; CHECK-NEXT:    vadd.i32 q2, q0, q6
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #224]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s30, s27
+; CHECK-NEXT:    vmov.f32 s31, s19
+; CHECK-NEXT:    vmov.f32 s21, s2
+; CHECK-NEXT:    vmov.f32 s28, s15
+; CHECK-NEXT:    vmov.f32 s27, s18
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vmov.f32 s29, s3
+; CHECK-NEXT:    vmov.f32 s23, s18
+; CHECK-NEXT:    vadd.i32 q7, q5, q7
+; CHECK-NEXT:    vmov.f32 s22, s25
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vmov.f32 s20, s13
+; CHECK-NEXT:    vmov.f32 s14, s24
+; CHECK-NEXT:    vmov.f32 s15, s16
+; CHECK-NEXT:    vmov.f32 s21, s1
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vadd.i32 q0, q3, q5
+; CHECK-NEXT:    vadd.i32 q0, q0, q7
+; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <64 x i32>, <64 x i32>* %src, align 4
+  %s1 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %s2 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %s3 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %s4 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  %a1 = add <16 x i32> %s1, %s2
+  %a2 = add <16 x i32> %s3, %s4
+  %a3 = add <16 x i32> %a1, %a2
+  store <16 x i32> %a3, <16 x i32> *%dst
+  ret void
+}
+
+; i16
+
+define void @vld4_v2i16(<8 x i16> *%src, <2 x i16> *%dst) {
+; CHECK-LABEL: vld4_v2i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strh r0, [r1, #2]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x i16>, <8 x i16>* %src, align 4
+  %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+  %a1 = add <2 x i16> %s1, %s2
+  %a2 = add <2 x i16> %s3, %s4
+  %a3 = add <2 x i16> %a1, %a2
+  store <2 x i16> %a3, <2 x i16> *%dst
+  ret void
+}
+
+define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) {
+; CHECK-LABEL: vld4_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vadd.i32 q2, q3, q2
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vadd.i32 q0, q3, q4
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    vstrh.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i16>, <16 x i16>* %src, align 4
+  %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = add <4 x i16> %s1, %s2
+  %a2 = add <4 x i16> %s3, %s4
+  %a3 = add <4 x i16> %a1, %a2
+  store <4 x i16> %a3, <4 x i16> *%dst
+  ret void
+}
+
+define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) {
+; CHECK-LABEL: vld4_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[1]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[2]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[3]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[6]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vadd.i16 q4, q5, q4
+; CHECK-NEXT:    vmov.u16 r0, q6[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[1]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[2]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[3]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[6]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q7[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q7[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q7[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q7[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[0]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[1]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[2]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vadd.i16 q0, q5, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x i16>, <32 x i16>* %src, align 4
+  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %s3 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %s4 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %a1 = add <8 x i16> %s1, %s2
+  %a2 = add <8 x i16> %s3, %s4
+  %a3 = add <8 x i16> %a1, %a2
+  store <8 x i16> %a3, <8 x i16> *%dst
+  ret void
+}
+
+define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
+; CHECK-LABEL: vld4_v16i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[7]
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[4]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[5]
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[6]
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[7]
+; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.16 q6[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[0]
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.16 q6[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[6]
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[7]
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q6[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.16 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vadd.i16 q4, q5, q4
+; CHECK-NEXT:    vmov.u16 r2, q6[0]
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vmov.16 q6[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[4]
+; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[6]
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q7[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q7[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q7[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q7[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q7[0]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q7[1]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q7[2]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q7[3]
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[1]
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[5]
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[7]
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vadd.i16 q0, q5, q0
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vadd.i16 q0, q0, q4
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[6]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q7[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q7[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q7[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q7[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q7[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q7[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q7[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q7[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vadd.i16 q5, q5, q7
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[1]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vadd.i16 q0, q1, q6
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i16 q0, q0, q5
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <64 x i16>, <64 x i16>* %src, align 4
+  %s1 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %s2 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %s3 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %s4 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  %a1 = add <16 x i16> %s1, %s2
+  %a2 = add <16 x i16> %s3, %s4
+  %a3 = add <16 x i16> %a1, %a2
+  store <16 x i16> %a3, <16 x i16> *%dst
+  ret void
+}
+
+; i8
+
+define void @vld4_v2i8(<8 x i8> *%src, <2 x i8> *%dst) {
+; CHECK-LABEL: vld4_v2i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strb r0, [r1, #1]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    strb r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x i8>, <8 x i8>* %src, align 4
+  %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+  %a1 = add <2 x i8> %s1, %s2
+  %a2 = add <2 x i8> %s3, %s4
+  %a3 = add <2 x i8> %a1, %a2
+  store <2 x i8> %a3, <2 x i8> *%dst
+  ret void
+}
+
+define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) {
+; CHECK-LABEL: vld4_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vrev32.8 q2, q0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vrev16.8 q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vstrb.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i8>, <16 x i8>* %src, align 4
+  %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = add <4 x i8> %s1, %s2
+  %a2 = add <4 x i8> %s3, %s4
+  %a3 = add <4 x i8> %a1, %a2
+  store <4 x i8> %a3, <4 x i8> *%dst
+  ret void
+}
+
+define void @vld4_v8i8(<32 x i8> *%src, <8 x i8> *%dst) {
+; CHECK-LABEL: vld4_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vadd.i16 q2, q3, q2
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vadd.i16 q0, q3, q4
+; CHECK-NEXT:    vadd.i16 q0, q0, q2
+; CHECK-NEXT:    vstrb.16 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x i8>, <32 x i8>* %src, align 4
+  %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %s3 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %s4 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %a1 = add <8 x i8> %s1, %s2
+  %a2 = add <8 x i8> %s3, %s4
+  %a3 = add <8 x i8> %a1, %a2
+  store <8 x i8> %a3, <8 x i8> *%dst
+  ret void
+}
+
+define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) {
+; CHECK-LABEL: vld4_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.8 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.8 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.8 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.8 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    vmov.8 q2[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.8 q2[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.8 q2[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    vmov.8 q2[7], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[0]
+; CHECK-NEXT:    vmov.8 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[1]
+; CHECK-NEXT:    vmov.8 q4[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[2]
+; CHECK-NEXT:    vmov.8 q4[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[3]
+; CHECK-NEXT:    vmov.8 q4[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[4]
+; CHECK-NEXT:    vmov.8 q4[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[5]
+; CHECK-NEXT:    vmov.8 q4[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[6]
+; CHECK-NEXT:    vmov.8 q4[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[7]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.8 q4[7], r2
+; CHECK-NEXT:    vmov.u8 r0, q3[3]
+; CHECK-NEXT:    vmov.u8 r2, q2[3]
+; CHECK-NEXT:    vmov.8 q5[8], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[7]
+; CHECK-NEXT:    vmov.8 q5[9], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[11]
+; CHECK-NEXT:    vmov.8 q5[10], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[15]
+; CHECK-NEXT:    vmov.8 q5[11], r2
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[7]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[11]
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[15]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[8]
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[9]
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[10]
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[11]
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[12]
+; CHECK-NEXT:    vmov.8 q4[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[13]
+; CHECK-NEXT:    vmov.8 q4[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[14]
+; CHECK-NEXT:    vmov.8 q4[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[15]
+; CHECK-NEXT:    vmov.8 q4[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.8 q6[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.8 q6[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.8 q6[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.8 q6[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q6[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.8 q6[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q6[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.8 q6[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[0]
+; CHECK-NEXT:    vmov.8 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[1]
+; CHECK-NEXT:    vmov.8 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[2]
+; CHECK-NEXT:    vmov.8 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[3]
+; CHECK-NEXT:    vmov.8 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[4]
+; CHECK-NEXT:    vmov.8 q5[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.8 q5[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[6]
+; CHECK-NEXT:    vmov.8 q5[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[7]
+; CHECK-NEXT:    vmov.8 q5[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q6[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-NEXT:    vmov.8 q6[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.8 q6[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.8 q6[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q6[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[6]
+; CHECK-NEXT:    vmov.8 q6[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[10]
+; CHECK-NEXT:    vmov.8 q6[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[14]
+; CHECK-NEXT:    vmov.8 q6[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[8]
+; CHECK-NEXT:    vmov.8 q5[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[9]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[10]
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[12]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[13]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.8 q6[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.8 q6[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.8 q6[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.8 q6[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q6[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.8 q6[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.8 q6[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q6[7], r0
+; CHECK-NEXT:    vadd.i8 q4, q5, q4
+; CHECK-NEXT:    vmov.u8 r0, q6[0]
+; CHECK-NEXT:    vmov.8 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[1]
+; CHECK-NEXT:    vmov.8 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[2]
+; CHECK-NEXT:    vmov.8 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[3]
+; CHECK-NEXT:    vmov.8 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[4]
+; CHECK-NEXT:    vmov.8 q5[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.8 q5[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[6]
+; CHECK-NEXT:    vmov.8 q5[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[7]
+; CHECK-NEXT:    vmov.8 q5[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-NEXT:    vmov.8 q6[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.8 q6[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.8 q6[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.8 q6[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[0]
+; CHECK-NEXT:    vmov.8 q6[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[4]
+; CHECK-NEXT:    vmov.8 q6[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[8]
+; CHECK-NEXT:    vmov.8 q6[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[12]
+; CHECK-NEXT:    vmov.8 q6[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[8]
+; CHECK-NEXT:    vmov.8 q5[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[9]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[10]
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[12]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[13]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.8 q7[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.8 q7[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.8 q7[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.8 q7[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.8 q7[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.8 q7[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.8 q7[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q7[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[0]
+; CHECK-NEXT:    vmov.8 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[1]
+; CHECK-NEXT:    vmov.8 q0[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[2]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[3]
+; CHECK-NEXT:    vmov.8 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[4]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[5]
+; CHECK-NEXT:    vmov.8 q0[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[6]
+; CHECK-NEXT:    vmov.8 q0[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[7]
+; CHECK-NEXT:    vmov.8 q0[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q1[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-NEXT:    vmov.8 q1[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.8 q1[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.8 q1[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[1]
+; CHECK-NEXT:    vmov.8 q1[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q1[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[9]
+; CHECK-NEXT:    vmov.8 q1[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[13]
+; CHECK-NEXT:    vmov.8 q1[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vadd.i8 q0, q5, q0
+; CHECK-NEXT:    vadd.i8 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <64 x i8>, <64 x i8>* %src, align 4
+  %s1 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %s2 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %s3 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %s4 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  %a1 = add <16 x i8> %s1, %s2
+  %a2 = add <16 x i8> %s3, %s4
+  %a3 = add <16 x i8> %a1, %a2
+  store <16 x i8> %a3, <16 x i8> *%dst
+  ret void
+}
+
+; i64
+
+define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
+; CHECK-LABEL: vld4_v2i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d8, d7
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vmov.f32 s14, s20
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s15, s21
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov r12, s19
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    adds.w lr, r0, r3
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    vmov r4, s13
+; CHECK-NEXT:    adcs r0, r3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r4, r3
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    adds r0, r0, r5
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+entry:
+  %l1 = load <8 x i64>, <8 x i64>* %src, align 4
+  %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+  %a1 = add <2 x i64> %s1, %s2
+  %a2 = add <2 x i64> %s3, %s4
+  %a3 = add <2 x i64> %a1, %a2
+  store <2 x i64> %a3, <2 x i64> *%dst
+  ret void
+}
+
+define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
+; CHECK-LABEL: vld4_v4i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #72
+; CHECK-NEXT:    sub sp, #72
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d8, d3
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT:    vmov.f64 d12, d11
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s25, s23
+; CHECK-NEXT:    vmov.f32 s26, s2
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vmov.f32 s27, s3
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d4, d15
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s9, s31
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s30, s0
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s31, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r0, s30
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov r12, s11
+; CHECK-NEXT:    vmov r2, s31
+; CHECK-NEXT:    vmov.f32 s22, s0
+; CHECK-NEXT:    vmov.f32 s23, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r7, s16
+; CHECK-NEXT:    adds.w lr, r0, r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s23
+; CHECK-NEXT:    adcs r0, r2
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adc.w r12, r12, r0
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    vmov r3, s27
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r4, r3
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    vmov r6, s20
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    adds r0, r0, r5
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    adc.w r8, r3, r2
+; CHECK-NEXT:    vmov r2, s25
+; CHECK-NEXT:    vmov r4, s21
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    vmov r6, s1
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    adds r3, r3, r7
+; CHECK-NEXT:    vmov r7, s28
+; CHECK-NEXT:    adcs r4, r6
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    vmov r6, s8
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov r5, s29
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    vmov.32 q0[3], r8
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    adds r6, r6, r7
+; CHECK-NEXT:    adcs r4, r5
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    adds r0, r0, r5
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    adds r0, r0, r6
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #72
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+entry:
+  %l1 = load <16 x i64>, <16 x i64>* %src, align 4
+  %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = add <4 x i64> %s1, %s2
+  %a2 = add <4 x i64> %s3, %s4
+  %a3 = add <4 x i64> %a1, %a2
+  store <4 x i64> %a3, <4 x i64> *%dst
+  ret void
+}
+
+; f32
+
+define void @vld4_v2f32(<8 x float> *%src, <2 x float> *%dst) {
+; CHECK-LABEL: vld4_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s8, s7
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vmov.f32 s13, s2
+; CHECK-NEXT:    vadd.f32 q2, q3, q2
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s13, s1
+; CHECK-NEXT:    vmov.f32 s5, s0
+; CHECK-NEXT:    vadd.f32 q0, q1, q3
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
+; CHECK-NEXT:    vstmia r1, {s0, s1}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x float>, <8 x float>* %src, align 4
+  %s1 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 3, i32 7>
+  %a1 = fadd <2 x float> %s1, %s2
+  %a2 = fadd <2 x float> %s3, %s4
+  %a3 = fadd <2 x float> %a1, %a2
+  store <2 x float> %a3, <2 x float> *%dst
+  ret void
+}
+
+define void @vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) {
+; CHECK-LABEL: vld4_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vadd.f32 q0, q0, q5
+; CHECK-NEXT:    vadd.f32 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x float>, <16 x float>* %src, align 4
+  %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = fadd <4 x float> %s1, %s2
+  %a2 = fadd <4 x float> %s3, %s4
+  %a3 = fadd <4 x float> %a1, %a2
+  store <4 x float> %a3, <4 x float> *%dst
+  ret void
+}
+
+define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) {
+; CHECK-LABEL: vld4_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vadd.f32 q0, q0, q5
+; CHECK-NEXT:    vmov.f64 d12, d3
+; CHECK-NEXT:    vadd.f32 q0, q0, q4
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s22, s19
+; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vmov.f32 s25, s14
+; CHECK-NEXT:    vmov.f32 s20, s7
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vmov.f32 s26, s18
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmov.f32 s27, s10
+; CHECK-NEXT:    vadd.f32 q5, q6, q5
+; CHECK-NEXT:    vmov.f32 s26, s17
+; CHECK-NEXT:    vmov.f32 s27, s9
+; CHECK-NEXT:    vmov.f32 s24, s5
+; CHECK-NEXT:    vmov.f32 s6, s16
+; CHECK-NEXT:    vmov.f32 s7, s8
+; CHECK-NEXT:    vmov.f32 s25, s13
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vadd.f32 q1, q1, q6
+; CHECK-NEXT:    vadd.f32 q1, q1, q5
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x float>, <32 x float>* %src, align 4
+  %s1 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %s2 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %s3 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %s4 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %a1 = fadd <8 x float> %s1, %s2
+  %a2 = fadd <8 x float> %s3, %s4
+  %a3 = fadd <8 x float> %a1, %a2
+  store <8 x float> %a3, <8 x float> *%dst
+  ret void
+}
+
+define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) {
+; CHECK-LABEL: vld4_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f64 d10, d1
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s16, s3
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s23, s6
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s22, s13
+; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s3, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #112]
+; CHECK-NEXT:    vadd.f32 q0, q0, q5
+; CHECK-NEXT:    vadd.f32 q0, q0, q4
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d0, d3
+; CHECK-NEXT:    vmov.f32 s22, s19
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vmov.f32 s26, s17
+; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vmov.f32 s27, s9
+; CHECK-NEXT:    vmov.f32 s20, s7
+; CHECK-NEXT:    vmov.f32 s24, s5
+; CHECK-NEXT:    vmov.f32 s1, s14
+; CHECK-NEXT:    vmov.f32 s6, s16
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #144]
+; CHECK-NEXT:    vmov.f32 s7, s8
+; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmov.f32 s25, s13
+; CHECK-NEXT:    vadd.f32 q5, q0, q5
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #176]
+; CHECK-NEXT:    vadd.f32 q0, q1, q6
+; CHECK-NEXT:    vadd.f32 q1, q0, q5
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #160]
+; CHECK-NEXT:    vmov.f64 d0, d5
+; CHECK-NEXT:    vmov.f32 s26, s23
+; CHECK-NEXT:    vmov.f32 s23, s14
+; CHECK-NEXT:    vmov.f32 s30, s21
+; CHECK-NEXT:    vmov.f32 s27, s15
+; CHECK-NEXT:    vmov.f32 s31, s13
+; CHECK-NEXT:    vmov.f32 s24, s11
+; CHECK-NEXT:    vmov.f32 s28, s9
+; CHECK-NEXT:    vmov.f32 s1, s18
+; CHECK-NEXT:    vmov.f32 s10, s20
+; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vmov.f32 s11, s12
+; CHECK-NEXT:    vmov.f32 s3, s14
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vmov.f32 s25, s19
+; CHECK-NEXT:    vmov.f32 s29, s17
+; CHECK-NEXT:    vadd.f32 q6, q0, q6
+; CHECK-NEXT:    vmov.f32 s9, s16
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
+; CHECK-NEXT:    vadd.f32 q0, q2, q7
+; CHECK-NEXT:    vmov.f64 d10, d7
+; CHECK-NEXT:    vadd.f32 q2, q0, q6
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #224]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s30, s27
+; CHECK-NEXT:    vmov.f32 s31, s19
+; CHECK-NEXT:    vmov.f32 s21, s2
+; CHECK-NEXT:    vmov.f32 s28, s15
+; CHECK-NEXT:    vmov.f32 s27, s18
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vmov.f32 s29, s3
+; CHECK-NEXT:    vmov.f32 s23, s18
+; CHECK-NEXT:    vadd.f32 q7, q5, q7
+; CHECK-NEXT:    vmov.f32 s22, s25
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vmov.f32 s20, s13
+; CHECK-NEXT:    vmov.f32 s14, s24
+; CHECK-NEXT:    vmov.f32 s15, s16
+; CHECK-NEXT:    vmov.f32 s21, s1
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vadd.f32 q0, q3, q5
+; CHECK-NEXT:    vadd.f32 q0, q0, q7
+; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <64 x float>, <64 x float>* %src, align 4
+  %s1 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %s2 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %s3 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %s4 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  %a1 = fadd <16 x float> %s1, %s2
+  %a2 = fadd <16 x float> %s3, %s4
+  %a3 = fadd <16 x float> %a1, %a2
+  store <16 x float> %a3, <16 x float> *%dst
+  ret void
+}
+
+; f16
+
+define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) {
+; CHECK-LABEL: vld4_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vadd.f16 q1, q2, q1
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x half>, <8 x half>* %src, align 4
+  %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 3, i32 7>
+  %a1 = fadd <2 x half> %s1, %s2
+  %a2 = fadd <2 x half> %s3, %s4
+  %a3 = fadd <2 x half> %a1, %a2
+  store <2 x half> %a3, <2 x half> *%dst
+  ret void
+}
+
+define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
+; CHECK-LABEL: vld4_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.16 q2[1], r3
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmovx.f16 s16, s3
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s0
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vadd.f16 q2, q2, q3
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vadd.f16 q0, q1, q3
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    strd r0, r2, [r1]
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x half>, <16 x half>* %src, align 4
+  %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = fadd <4 x half> %s1, %s2
+  %a2 = fadd <4 x half> %s3, %s4
+  %a3 = fadd <4 x half> %a1, %a2
+  store <4 x half> %a3, <4 x half> *%dst
+  ret void
+}
+
+define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) {
+; CHECK-LABEL: vld4_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov.16 q4[1], r3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vmovx.f16 s24, s1
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmovx.f16 s24, s3
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s9
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s11
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s13
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s15
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s0
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vadd.f16 q4, q4, q5
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s6
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s12
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vadd.f16 q0, q1, q5
+; CHECK-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x half>, <32 x half>* %src, align 4
+  %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %s3 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %s4 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %a1 = fadd <8 x half> %s1, %s2
+  %a2 = fadd <8 x half> %s3, %s4
+  %a3 = fadd <8 x half> %a1, %a2
+  store <8 x half> %a3, <8 x half> *%dst
+  ret void
+}
+
+define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) {
+; CHECK-LABEL: vld4_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov.16 q4[1], r3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vmovx.f16 s24, s1
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov.16 q5[1], r3
+; CHECK-NEXT:    vmovx.f16 s24, s3
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s9
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s11
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s13
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s15
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s0
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    vadd.f16 q4, q4, q5
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmovx.f16 s20, s6
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.16 q5[1], r3
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s12
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.16 q1[1], r3
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vadd.f16 q0, q1, q5
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vadd.f16 q4, q0, q4
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s16, s13
+; CHECK-NEXT:    vmovx.f16 s20, s9
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s15
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s1
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s3
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s24, s12
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vadd.f16 q4, q5, q4
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmovx.f16 s12, s14
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s8, s10
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s4, s6
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vadd.f16 q0, q5, q3
+; CHECK-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <64 x half>, <64 x half>* %src, align 4
+  %s1 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %s2 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %s3 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %s4 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  %a1 = fadd <16 x half> %s1, %s2
+  %a2 = fadd <16 x half> %s3, %s4
+  %a3 = fadd <16 x half> %a1, %a2
+  store <16 x half> %a3, <16 x half> *%dst
+  ret void
+}
+
+; f64
+
+define void @vld4_v2f64(<8 x double> *%src, <2 x double> *%dst) {
+; CHECK-LABEL: vld4_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f64 d0, d0, d1
+; CHECK-NEXT:    vadd.f64 d1, d2, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vadd.f64 d2, d2, d3
+; CHECK-NEXT:    vadd.f64 d3, d4, d5
+; CHECK-NEXT:    vadd.f64 d1, d1, d0
+; CHECK-NEXT:    vadd.f64 d0, d3, d2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x double>, <8 x double>* %src, align 4
+  %s1 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 3, i32 7>
+  %a1 = fadd <2 x double> %s1, %s2
+  %a2 = fadd <2 x double> %s3, %s4
+  %a3 = fadd <2 x double> %a1, %a2
+  store <2 x double> %a3, <2 x double> *%dst
+  ret void
+}
+
+define void @vld4_v4f64(<16 x double> *%src, <4 x double> *%dst) {
+; CHECK-LABEL: vld4_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vadd.f64 d0, d0, d1
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vadd.f64 d1, d2, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vadd.f64 d2, d2, d3
+; CHECK-NEXT:    vadd.f64 d3, d4, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vadd.f64 d4, d4, d5
+; CHECK-NEXT:    vadd.f64 d5, d6, d7
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vadd.f64 d6, d6, d7
+; CHECK-NEXT:    vadd.f64 d7, d8, d9
+; CHECK-NEXT:    vadd.f64 d1, d1, d0
+; CHECK-NEXT:    vadd.f64 d0, d3, d2
+; CHECK-NEXT:    vadd.f64 d3, d5, d4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vadd.f64 d2, d7, d6
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x double>, <16 x double>* %src, align 4
+  %s1 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = fadd <4 x double> %s1, %s2
+  %a2 = fadd <4 x double> %s3, %s4
+  %a3 = fadd <4 x double> %a1, %a2
+  store <4 x double> %a3, <4 x double> *%dst
+  ret void
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
new file mode 100644
index 000000000000..a8073cf26203
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -0,0 +1,1002 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
+
+; i32
+
+define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: vst2_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrd r2, r12, [r0]
+; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
+  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
+  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
+  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
+  %s = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32> %s, <4 x i32> *%dst
+  ret void
+}
+
+define void @vst2_v4i32(<4 x i32> *%src, <8 x i32> *%dst) {
+; CHECK-LABEL: vst2_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d3
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vmov.f32 s9, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
+  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
+  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
+  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
+  %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %s, <8 x i32> *%dst
+  ret void
+}
+
+define void @vst2_v8i32(<8 x i32> *%src, <16 x i32> *%dst) {
+; CHECK-LABEL: vst2_v8i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d6, d8
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s17
+; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vmov.f32 s0, s18
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s2, s19
+; CHECK-NEXT:    vmov.f64 d8, d4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s18, s9
+; CHECK-NEXT:    vmov.f32 s19, s5
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
+  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
+  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
+  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
+  %s = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i32> %s, <16 x i32> *%dst
+  ret void
+}
+
+define void @vst2_v16i32(<16 x i32> *%src, <32 x i32> *%dst) {
+; CHECK-LABEL: vst2_v16i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d14, d0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s29, s16
+; CHECK-NEXT:    vmov.f32 s30, s1
+; CHECK-NEXT:    vmov.f32 s31, s17
+; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d14, d10
+; CHECK-NEXT:    vmov.f32 s29, s4
+; CHECK-NEXT:    vmov.f32 s30, s21
+; CHECK-NEXT:    vmov.f32 s31, s5
+; CHECK-NEXT:    vmov.f32 s4, s22
+; CHECK-NEXT:    vstrw.32 q7, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s6, s23
+; CHECK-NEXT:    vmov.f64 d10, d12
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s16, s2
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.f32 s17, s18
+; CHECK-NEXT:    vmov.f32 s22, s25
+; CHECK-NEXT:    vmov.f32 s18, s3
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s23, s9
+; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s8, s26
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s10, s27
+; CHECK-NEXT:    vmov.f64 d12, d0
+; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s25, s12
+; CHECK-NEXT:    vmov.f32 s26, s1
+; CHECK-NEXT:    vmov.f32 s27, s13
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vstrw.32 q6, [r1, #96]
+; CHECK-NEXT:    vmov.f32 s13, s14
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
+  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
+  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
+  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
+  %s = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x i32> %s, <32 x i32> *%dst
+  ret void
+}
+
+; i16
+
+define void @vst2_v2i16(<2 x i16> *%src, <4 x i16> *%dst) {
+; CHECK-LABEL: vst2_v2i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrh r3, [r0]
+; CHECK-NEXT:    ldrh r2, [r0, #4]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    ldrh.w r12, [r0, #6]
+; CHECK-NEXT:    ldrh r0, [r0, #2]
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrh.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
+  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
+  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
+  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
+  %s = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i16> %s, <4 x i16> *%dst
+  ret void
+}
+
+define void @vst2_v4i16(<4 x i16> *%src, <8 x i16> *%dst) {
+; CHECK-LABEL: vst2_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q1, [r0]
+; CHECK-NEXT:    vmovnt.i32 q1, q0
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
+  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
+  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
+  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
+  %s = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i16> %s, <8 x i16> *%dst
+  ret void
+}
+
+define void @vst2_v8i16(<8 x i16> *%src, <16 x i16> *%dst) {
+; CHECK-LABEL: vst2_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
+  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
+  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
+  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
+  %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i16> %s, <16 x i16> *%dst
+  ret void
+}
+
+define void @vst2_v16i16(<16 x i16> *%src, <32 x i16> *%dst) {
+; CHECK-LABEL: vst2_v16i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[1]
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[4]
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[5]
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[7]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[1]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
+  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
+  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
+  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
+  %s = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x i16> %s, <32 x i16> *%dst
+  ret void
+}
+
+; i8
+
+define void @vst2_v2i8(<2 x i8> *%src, <4 x i8> *%dst) {
+; CHECK-LABEL: vst2_v2i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrb r2, [r0]
+; CHECK-NEXT:    ldrb r3, [r0, #2]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    ldrb.w r12, [r0, #1]
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    ldrb r0, [r0, #3]
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vstrb.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
+  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
+  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
+  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
+  %s = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i8> %s, <4 x i8> *%dst
+  ret void
+}
+
+define void @vst2_v4i8(<4 x i8> *%src, <8 x i8> *%dst) {
+; CHECK-LABEL: vst2_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
+; CHECK-NEXT:    vldrb.u32 q1, [r0]
+; CHECK-NEXT:    vmovnt.i32 q1, q0
+; CHECK-NEXT:    vstrb.16 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
+  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
+  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
+  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
+  %s = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i8> %s, <8 x i8> *%dst
+  ret void
+}
+
+define void @vst2_v8i8(<8 x i8> *%src, <16 x i8> *%dst) {
+; CHECK-LABEL: vst2_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
+; CHECK-NEXT:    vldrb.u16 q1, [r0]
+; CHECK-NEXT:    vmovnt.i16 q1, q0
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
+  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
+  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
+  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
+  %s = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %s, <16 x i8> *%dst
+  ret void
+}
+
+define void @vst2_v16i8(<16 x i8> *%src, <32 x i8> *%dst) {
+; CHECK-LABEL: vst2_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u8 r2, q1[8]
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.8 q0[0], r2
+; CHECK-NEXT:    vmov.8 q0[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.8 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.8 q0[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.8 q0[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.8 q0[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-NEXT:    vmov.8 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.8 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.8 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.8 q3[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.8 q3[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.8 q3[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-NEXT:    vmov.8 q3[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.8 q3[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-NEXT:    vmov.8 q3[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.8 q3[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov.8 q3[15], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
+  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
+  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
+  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
+  %s = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x i8> %s, <32 x i8> *%dst
+  ret void
+}
+
+; i64
+
+define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
+; CHECK-LABEL: vst2_v2i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vmov.f32 s9, s5
+; CHECK-NEXT:    vmov.f32 s10, s0
+; CHECK-NEXT:    vmov.f32 s11, s1
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
+  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
+  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
+  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
+  %s = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i64> %s, <4 x i64> *%dst
+  ret void
+}
+
+define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
+; CHECK-LABEL: vst2_v4i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov.f64 d10, d3
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s21, s7
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmov.f32 s22, s10
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s7, s9
+; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vstrb.8 q1, [r0], #48
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s3, s17
+; CHECK-NEXT:    vstrw.32 q3, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
+  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
+  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
+  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
+  %s = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i64> %s, <8 x i64> *%dst
+  ret void
+}
+
+; f32
+
+define void @vst2_v2f32(<2 x float> *%src, <4 x float> *%dst) {
+; CHECK-LABEL: vst2_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr s0, [r0]
+; CHECK-NEXT:    vldr s4, [r0, #4]
+; CHECK-NEXT:    vldr s1, [r0, #8]
+; CHECK-NEXT:    vldr s5, [r0, #12]
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
+  %l1 = load <2 x float>, <2 x float>* %s1, align 4
+  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
+  %l2 = load <2 x float>, <2 x float>* %s2, align 4
+  %s = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x float> %s, <4 x float> *%dst
+  ret void
+}
+
+define void @vst2_v4f32(<4 x float> *%src, <8 x float> *%dst) {
+; CHECK-LABEL: vst2_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d3
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vmov.f32 s9, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
+  %l1 = load <4 x float>, <4 x float>* %s1, align 4
+  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
+  %l2 = load <4 x float>, <4 x float>* %s2, align 4
+  %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %s, <8 x float> *%dst
+  ret void
+}
+
+define void @vst2_v8f32(<8 x float> *%src, <16 x float> *%dst) {
+; CHECK-LABEL: vst2_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d6, d8
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s17
+; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vmov.f32 s0, s18
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s2, s19
+; CHECK-NEXT:    vmov.f64 d8, d4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s18, s9
+; CHECK-NEXT:    vmov.f32 s19, s5
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
+  %l1 = load <8 x float>, <8 x float>* %s1, align 4
+  %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
+  %l2 = load <8 x float>, <8 x float>* %s2, align 4
+  %s = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x float> %s, <16 x float> *%dst
+  ret void
+}
+
+define void @vst2_v16f32(<16 x float> *%src, <32 x float> *%dst) {
+; CHECK-LABEL: vst2_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d14, d0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s29, s16
+; CHECK-NEXT:    vmov.f32 s30, s1
+; CHECK-NEXT:    vmov.f32 s31, s17
+; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d14, d10
+; CHECK-NEXT:    vmov.f32 s29, s4
+; CHECK-NEXT:    vmov.f32 s30, s21
+; CHECK-NEXT:    vmov.f32 s31, s5
+; CHECK-NEXT:    vmov.f32 s4, s22
+; CHECK-NEXT:    vstrw.32 q7, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s6, s23
+; CHECK-NEXT:    vmov.f64 d10, d12
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s16, s2
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.f32 s17, s18
+; CHECK-NEXT:    vmov.f32 s22, s25
+; CHECK-NEXT:    vmov.f32 s18, s3
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s23, s9
+; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s8, s26
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vmov.f32 s10, s27
+; CHECK-NEXT:    vmov.f64 d12, d0
+; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s25, s12
+; CHECK-NEXT:    vmov.f32 s26, s1
+; CHECK-NEXT:    vmov.f32 s27, s13
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vstrw.32 q6, [r1, #96]
+; CHECK-NEXT:    vmov.f32 s13, s14
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
+  %l1 = load <16 x float>, <16 x float>* %s1, align 4
+  %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
+  %l2 = load <16 x float>, <16 x float>* %s2, align 4
+  %s = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x float> %s, <32 x float> *%dst
+  ret void
+}
+
+; f16
+
+define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
+; CHECK-LABEL: vst2_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrd r2, r0, [r0]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    strd r0, r2, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
+  %l1 = load <2 x half>, <2 x half>* %s1, align 4
+  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
+  %l2 = load <2 x half>, <2 x half>* %s2, align 4
+  %s = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x half> %s, <4 x half> *%dst
+  ret void
+}
+
+define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
+; CHECK-LABEL: vst2_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12}
+; CHECK-NEXT:    vmov.32 q0[0], r12
+; CHECK-NEXT:    ldr r0, [r0, #12]
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s8, s9
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
+  %l1 = load <4 x half>, <4 x half>* %s1, align 4
+  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
+  %l2 = load <4 x half>, <4 x half>* %s2, align 4
+  %s = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x half> %s, <8 x half> *%dst
+  ret void
+}
+
+define void @vst2_v8f16(<8 x half> *%src, <16 x half> *%dst) {
+; CHECK-LABEL: vst2_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s2
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
+  %l1 = load <8 x half>, <8 x half>* %s1, align 4
+  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
+  %l2 = load <8 x half>, <8 x half>* %s2, align 4
+  %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x half> %s, <16 x half> *%dst
+  ret void
+}
+
+define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) {
+; CHECK-LABEL: vst2_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10}
+; CHECK-NEXT:    vpush {d8, d9, d10}
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmovx.f16 s0, s12
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q4[1], r3
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s13
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s9
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmovx.f16 s20, s14
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s20, s10
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s12, s15
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s1
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmovx.f16 s4, s7
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s2
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s0, s3
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
+  %l1 = load <16 x half>, <16 x half>* %s1, align 4
+  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
+  %l2 = load <16 x half>, <16 x half>* %s2, align 4
+  %s = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x half> %s, <32 x half> *%dst
+  ret void
+}
+
+; f64
+
+define void @vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) {
+; CHECK-LABEL: vst2_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d3
+; CHECK-NEXT:    vmov.f64 d5, d1
+; CHECK-NEXT:    vmov.f64 d3, d0
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
+  %l1 = load <2 x double>, <2 x double>* %s1, align 4
+  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
+  %l2 = load <2 x double>, <2 x double>* %s2, align 4
+  %s = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x double> %s, <4 x double> *%dst
+  ret void
+}
+
+define void @vst2_v4f64(<4 x double> *%src, <8 x double> *%dst) {
+; CHECK-LABEL: vst2_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d8, d4
+; CHECK-NEXT:    vmov.f64 d9, d0
+; CHECK-NEXT:    vmov.f64 d0, d5
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.f64 d4, d6
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f64 d5, d2
+; CHECK-NEXT:    vmov.f64 d2, d7
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
+  %l1 = load <4 x double>, <4 x double>* %s1, align 4
+  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
+  %l2 = load <4 x double>, <4 x double>* %s2, align 4
+  %s = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x double> %s, <8 x double> *%dst
+  ret void
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
new file mode 100644
index 000000000000..1942be9313ec
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -0,0 +1,1970 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
+
+; i32
+
+define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
+; CHECK-LABEL: vst3_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov.32 q1[2], r12
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q1[3], lr
+; CHECK-NEXT:    vmov.f32 s8, s7
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vmov.f32 s10, s0
+; CHECK-NEXT:    vmov.f32 s11, s5
+; CHECK-NEXT:    strd r2, r0, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
+  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
+  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
+  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
+  %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
+  %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
+  %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i32> %l3, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32> %s, <6 x i32> *%dst
+  ret void
+}
+
+define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) {
+; CHECK-LABEL: vst3_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vmov.f64 d6, d8
+; CHECK-NEXT:    vmov.32 r0, q0[0]
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.32 r0, q2[3]
+; CHECK-NEXT:    vmov.f32 s5, s1
+; CHECK-NEXT:    vmov.f32 s13, s8
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vmov.f32 s15, s17
+; CHECK-NEXT:    vmov.f32 s1, s19
+; CHECK-NEXT:    vmov.f32 s6, s18
+; CHECK-NEXT:    vmov.f32 s14, s22
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
+  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
+  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
+  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
+  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
+  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
+  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i32> %l3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %s, <12 x i32> *%dst
+  ret void
+}
+
+define void @vst3_v8i32(<8 x i32> *%src, <24 x i32> *%dst) {
+; CHECK-LABEL: vst3_v8i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    vmov.32 r2, q1[3]
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vdup.32 q2, r2
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s13, s23
+; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vmov.32 r0, q7[0]
+; CHECK-NEXT:    vmov.f64 d4, d8
+; CHECK-NEXT:    vmov.f32 s9, s24
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    vmov q4, q5
+; CHECK-NEXT:    vmov.f32 s21, s4
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.32 r0, q0[0]
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vmov.f32 s10, s18
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.32 r0, q4[3]
+; CHECK-NEXT:    vmov.f32 s4, s17
+; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s5, s29
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s28, s30
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s7, s18
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s29, s27
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s6, s26
+; CHECK-NEXT:    vmov.f32 s30, s18
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q7, [r1, #32]
+; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
+  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
+  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
+  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
+  %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
+  %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
+  %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x i32> %l3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i32> %s, <24 x i32> *%dst
+  ret void
+}
+
+define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) {
+; CHECK-LABEL: vst3_v16i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #160
+; CHECK-NEXT:    sub sp, #160
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s4, s25
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vmov.f32 s5, s29
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q2, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s7, s26
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s6, s22
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d4, d15
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.32 r0, q6[3]
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s9, s23
+; CHECK-NEXT:    vmov.f32 s11, s31
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f64 d2, d9
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s7, s19
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f64 d2, d10
+; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q7[0]
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s5, s24
+; CHECK-NEXT:    vmov.f32 s7, s21
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q4[0]
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s16, s13
+; CHECK-NEXT:    vmov.f32 s24, s5
+; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vmov.f32 s27, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    vmov.32 r0, q4[3]
+; CHECK-NEXT:    vmov q7, q0
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    vmov.f32 s21, s4
+; CHECK-NEXT:    vmov.f32 s23, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 r0, q0[0]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s22, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 r0, q7[0]
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
+; CHECK-NEXT:    vmov.f64 d2, d0
+; CHECK-NEXT:    vstrw.32 q5, [r1, #96]
+; CHECK-NEXT:    vstrw.32 q6, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #128]
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s28, s1
+; CHECK-NEXT:    vmov.f32 s31, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s30, s18
+; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q7, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #160
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
+  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
+  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
+  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
+  %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
+  %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
+  %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x i32> %l3, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i32> %s, <48 x i32> *%dst
+  ret void
+}
+
+; i16
+
+define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) {
+; CHECK-LABEL: vst3_v2i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldrh r2, [r0, #6]
+; CHECK-NEXT:    ldrh r3, [r0, #4]
+; CHECK-NEXT:    ldrh.w r12, [r0, #8]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    ldrh.w lr, [r0, #2]
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    ldrh r4, [r0]
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    ldrh r0, [r0, #10]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vdup.32 q1, r12
+; CHECK-NEXT:    vmov.f32 s3, s2
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vstrh.32 q0, [r1]
+; CHECK-NEXT:    str r0, [r1, #8]
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
+  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
+  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
+  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
+  %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
+  %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
+  %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i16> %l3, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i16> %s, <6 x i16> *%dst
+  ret void
+}
+
+define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) {
+; CHECK-LABEL: vst3_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrh.u32 q1, [r0]
+; CHECK-NEXT:    vldrh.u32 q3, [r0, #8]
+; CHECK-NEXT:    vmov.f64 d0, d5
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s1, s7
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
+  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
+  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
+  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
+  %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
+  %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
+  %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i16> %l3, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i16> %s, <12 x i16> *%dst
+  ret void
+}
+
+define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
+; CHECK-LABEL: vst3_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    vrev32.16 q5, q3
+; CHECK-NEXT:    vmov.u16 r2, q4[0]
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[1]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[2]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
+  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
+  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
+  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
+  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
+  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
+  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x i16> %l3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i16> %s, <24 x i16> *%dst
+  ret void
+}
+
+define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) {
+; CHECK-LABEL: vst3_v16i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q7, [r0]
+; CHECK-NEXT:    vmov.u16 r2, q5[5]
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[5]
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[6]
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[6]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[7]
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[7]
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q3[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q3[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.16 q3[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q7[0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vmov.16 q6[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q7[1]
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q7[2]
+; CHECK-NEXT:    vmov.16 q6[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.u16 r2, q6[0]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[1]
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[3]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[6]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[1]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vrev32.16 q1, q5
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[3]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[4]
+; CHECK-NEXT:    vmov q1, q7
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov q7, q4
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vrev32.16 q1, q4
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[5]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[6]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[6]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[7]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
+; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
+  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
+  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
+  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
+  %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
+  %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
+  %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x i16> %l3, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i16> %s, <48 x i16> *%dst
+  ret void
+}
+
+; i8
+
+define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) {
+; CHECK-LABEL: vst3_v2i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    ldrb r2, [r0]
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    ldrb r3, [r0, #1]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    ldrb.w r12, [r0, #2]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    ldrb.w lr, [r0, #3]
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    ldrb r5, [r0, #5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrb r0, [r0, #4]
+; CHECK-NEXT:    vmov.16 q0[1], r12
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    add r0, sp, #8
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[4], lr
+; CHECK-NEXT:    vmov.16 q0[5], r5
+; CHECK-NEXT:    vmov.16 q0[6], r6
+; CHECK-NEXT:    vmov.16 q0[7], r6
+; CHECK-NEXT:    vstrb.16 q0, [r2]
+; CHECK-NEXT:    vstrb.16 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    str r2, [r1]
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    strh r0, [r1, #4]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+entry:
+  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
+  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
+  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
+  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
+  %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
+  %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
+  %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i8> %l3, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i8> %s, <6 x i8> *%dst
+  ret void
+}
+
+define void @vst3_v4i8(<4 x i8> *%src, <12 x i8> *%dst) {
+; CHECK-LABEL: vst3_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrb.u32 q1, [r0]
+; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
+; CHECK-NEXT:    vldrb.u32 q3, [r0, #8]
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vstrb.16 q0, [r1]
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    str r0, [r1, #8]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
+  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
+  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
+  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
+  %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
+  %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
+  %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i8> %l3, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i8> %s, <12 x i8> *%dst
+  ret void
+}
+
+define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
+; CHECK-LABEL: vst3_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
+; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT:    vldrb.u16 q3, [r0]
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.u16 r2, q4[0]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r2, q4[1]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.8 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.8 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.8 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.8 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.8 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.8 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.8 q4[12], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.8 q4[13], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.8 q4[14], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q4[15], r0
+; CHECK-NEXT:    vstrb.16 q2, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
+  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
+  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
+  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
+  %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
+  %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
+  %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x i8> %l3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i8> %s, <24 x i8> *%dst
+  ret void
+}
+
+define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) {
+; CHECK-LABEL: vst3_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    vmov.8 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[5]
+; CHECK-NEXT:    vmov.8 q4[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[6]
+; CHECK-NEXT:    vmov.8 q4[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[6]
+; CHECK-NEXT:    vmov.8 q4[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.8 q4[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[7]
+; CHECK-NEXT:    vmov.8 q4[7], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[8]
+; CHECK-NEXT:    vmov.8 q4[9], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[8]
+; CHECK-NEXT:    vmov.8 q4[10], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[9]
+; CHECK-NEXT:    vmov.u8 r0, q3[6]
+; CHECK-NEXT:    vmov.8 q4[12], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[9]
+; CHECK-NEXT:    vmov.8 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[7]
+; CHECK-NEXT:    vmov.8 q4[13], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[10]
+; CHECK-NEXT:    vmov.8 q5[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[8]
+; CHECK-NEXT:    vmov.8 q4[15], r2
+; CHECK-NEXT:    vmov.8 q5[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[9]
+; CHECK-NEXT:    vmov.u8 r2, q4[0]
+; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[10]
+; CHECK-NEXT:    vmov.8 q0[0], r2
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.u8 r2, q4[1]
+; CHECK-NEXT:    vmov.8 q0[1], r2
+; CHECK-NEXT:    vmov.u8 r0, q5[2]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[3]
+; CHECK-NEXT:    vmov.8 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[4]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[5]
+; CHECK-NEXT:    vmov.8 q0[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[6]
+; CHECK-NEXT:    vmov.8 q0[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[7]
+; CHECK-NEXT:    vmov.8 q0[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[8]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[9]
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[10]
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[11]
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[12]
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[13]
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[14]
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[15]
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[0]
+; CHECK-NEXT:    vmov.8 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[1]
+; CHECK-NEXT:    vmov.8 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.8 q5[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q5[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q5[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[3]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[4]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.u8 r0, q5[0]
+; CHECK-NEXT:    vmov.8 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[1]
+; CHECK-NEXT:    vmov.8 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-NEXT:    vmov.8 q6[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q6[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q6[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q6[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.8 q6[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[2]
+; CHECK-NEXT:    vmov.8 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[3]
+; CHECK-NEXT:    vmov.8 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[4]
+; CHECK-NEXT:    vmov.8 q4[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.8 q4[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[6]
+; CHECK-NEXT:    vmov.8 q4[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[7]
+; CHECK-NEXT:    vmov.8 q4[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[8]
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[9]
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[10]
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[12]
+; CHECK-NEXT:    vmov.8 q4[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[13]
+; CHECK-NEXT:    vmov.8 q4[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.8 q4[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[15]
+; CHECK-NEXT:    vmov.8 q4[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.8 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[11]
+; CHECK-NEXT:    vmov.8 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.8 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[12]
+; CHECK-NEXT:    vmov.8 q5[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.8 q5[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[13]
+; CHECK-NEXT:    vmov.8 q5[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[14]
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[15]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.u8 r0, q5[0]
+; CHECK-NEXT:    vmov.8 q2[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[1]
+; CHECK-NEXT:    vmov.8 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.8 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q3[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.8 q3[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.8 q3[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[3]
+; CHECK-NEXT:    vmov.8 q2[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[4]
+; CHECK-NEXT:    vmov.8 q2[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q2[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[6]
+; CHECK-NEXT:    vmov.8 q2[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[7]
+; CHECK-NEXT:    vmov.8 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[8]
+; CHECK-NEXT:    vmov.8 q2[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[9]
+; CHECK-NEXT:    vmov.8 q2[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[10]
+; CHECK-NEXT:    vmov.8 q2[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[11]
+; CHECK-NEXT:    vmov.8 q2[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[12]
+; CHECK-NEXT:    vmov.8 q2[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[13]
+; CHECK-NEXT:    vmov.8 q2[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[14]
+; CHECK-NEXT:    vmov.8 q2[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[15]
+; CHECK-NEXT:    vmov.8 q2[15], r0
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
+  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
+  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
+  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
+  %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2
+  %l3 = load <16 x i8>, <16 x i8>* %s3, align 4
+  %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x i8> %l3, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i8> %s, <48 x i8> *%dst
+  ret void
+}
+
+; i64
+
+define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
+; CHECK-LABEL: vst3_v2i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov.f64 d6, d5
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s7, s9
+; CHECK-NEXT:    vstrb.8 q1, [r0], #32
+; CHECK-NEXT:    vstrw.32 q3, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
+  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
+  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
+  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
+  %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2
+  %l3 = load <2 x i64>, <2 x i64>* %s3, align 4
+  %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i64> %l3, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i64> %s, <6 x i64> *%dst
+  ret void
+}
+
+define void @vst3_v4i64(<4 x i64> *%src, <12 x i64> *%dst) {
+; CHECK-LABEL: vst3_v4i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d10, d2
+; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vmov.f32 s21, s5
+; CHECK-NEXT:    vmov.f32 s22, s28
+; CHECK-NEXT:    vmov.f32 s23, s29
+; CHECK-NEXT:    vmov.f64 d14, d12
+; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vmov.f32 s29, s25
+; CHECK-NEXT:    vmov.f64 d8, d7
+; CHECK-NEXT:    vmov.f32 s30, s12
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s31, s13
+; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s4, s8
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vmov.f32 s2, s26
+; CHECK-NEXT:    vstrw.32 q4, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s5, s9
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s3, s27
+; CHECK-NEXT:    vmov.f32 s9, s15
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
+  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
+  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
+  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
+  %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2
+  %l3 = load <4 x i64>, <4 x i64>* %s3, align 4
+  %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i64> %l3, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i64> %s, <12 x i64> *%dst
+  ret void
+}
+
+; f32
+
+define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) {
+; CHECK-LABEL: vst3_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr s0, [r0]
+; CHECK-NEXT:    vldr s3, [r0, #4]
+; CHECK-NEXT:    vldr s1, [r0, #8]
+; CHECK-NEXT:    ldr r2, [r0, #20]
+; CHECK-NEXT:    vldr s2, [r0, #16]
+; CHECK-NEXT:    ldr r0, [r0, #12]
+; CHECK-NEXT:    strd r0, r2, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
+  %l1 = load <2 x float>, <2 x float>* %s1, align 4
+  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
+  %l2 = load <2 x float>, <2 x float>* %s2, align 4
+  %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2
+  %l3 = load <2 x float>, <2 x float>* %s3, align 4
+  %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x float> %l3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s = shufflevector <4 x float> %t1, <4 x float> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x float> %s, <6 x float> *%dst
+  ret void
+}
+
+define void @vst3_v4f32(<4 x float> *%src, <12 x float> *%dst) {
+; CHECK-LABEL: vst3_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vmov.f64 d6, d8
+; CHECK-NEXT:    vmov.32 r0, q0[0]
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.32 r0, q2[3]
+; CHECK-NEXT:    vmov.f32 s5, s1
+; CHECK-NEXT:    vmov.f32 s13, s8
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vmov.f32 s15, s17
+; CHECK-NEXT:    vmov.f32 s1, s19
+; CHECK-NEXT:    vmov.f32 s6, s18
+; CHECK-NEXT:    vmov.f32 s14, s22
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
+  %l1 = load <4 x float>, <4 x float>* %s1, align 4
+  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
+  %l2 = load <4 x float>, <4 x float>* %s2, align 4
+  %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
+  %l3 = load <4 x float>, <4 x float>* %s3, align 4
+  %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x float> %l3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <8 x float> %t1, <8 x float> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x float> %s, <12 x float> *%dst
+  ret void
+}
+
+define void @vst3_v8f32(<8 x float> *%src, <24 x float> *%dst) {
+; CHECK-LABEL: vst3_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    vmov.32 r2, q1[3]
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vdup.32 q2, r2
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s13, s23
+; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vmov.32 r0, q7[0]
+; CHECK-NEXT:    vmov.f64 d4, d8
+; CHECK-NEXT:    vmov.f32 s9, s24
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    vmov q4, q5
+; CHECK-NEXT:    vmov.f32 s21, s4
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.32 r0, q0[0]
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vmov.f32 s10, s18
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.32 r0, q4[3]
+; CHECK-NEXT:    vmov.f32 s4, s17
+; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s5, s29
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s28, s30
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s7, s18
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s29, s27
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s6, s26
+; CHECK-NEXT:    vmov.f32 s30, s18
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q7, [r1, #32]
+; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
+  %l1 = load <8 x float>, <8 x float>* %s1, align 4
+  %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
+  %l2 = load <8 x float>, <8 x float>* %s2, align 4
+  %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2
+  %l3 = load <8 x float>, <8 x float>* %s3, align 4
+  %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x float> %l3, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <16 x float> %t1, <16 x float> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x float> %s, <24 x float> *%dst
+  ret void
+}
+
+define void @vst3_v16f32(<16 x float> *%src, <48 x float> *%dst) {
+; CHECK-LABEL: vst3_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #160
+; CHECK-NEXT:    sub sp, #160
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s4, s25
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vmov.f32 s5, s29
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q2, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s7, s26
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s6, s22
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d4, d15
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.32 r0, q6[3]
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s9, s23
+; CHECK-NEXT:    vmov.f32 s11, s31
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f64 d2, d9
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s7, s19
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f64 d2, d10
+; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q7[0]
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s5, s24
+; CHECK-NEXT:    vmov.f32 s7, s21
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q4[0]
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s16, s13
+; CHECK-NEXT:    vmov.f32 s24, s5
+; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vmov.f32 s27, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    vmov.32 r0, q4[3]
+; CHECK-NEXT:    vmov q7, q0
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    vmov.f32 s21, s4
+; CHECK-NEXT:    vmov.f32 s23, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 r0, q0[0]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s22, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 r0, q7[0]
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
+; CHECK-NEXT:    vmov.f64 d2, d0
+; CHECK-NEXT:    vstrw.32 q5, [r1, #96]
+; CHECK-NEXT:    vstrw.32 q6, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #128]
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s28, s1
+; CHECK-NEXT:    vmov.f32 s31, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s30, s18
+; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q7, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #160
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
+  %l1 = load <16 x float>, <16 x float>* %s1, align 4
+  %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
+  %l2 = load <16 x float>, <16 x float>* %s2, align 4
+  %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2
+  %l3 = load <16 x float>, <16 x float>* %s3, align 4
+  %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x float> %l3, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <32 x float> %t1, <32 x float> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x float> %s, <48 x float> *%dst
+  ret void
+}
+
+; f16
+
+define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
+; CHECK-LABEL: vst3_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12}
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov.32 q2[0], r12
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q0[1], r3
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s8
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    stm r1!, {r0, r2, r3}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
+  %l1 = load <2 x half>, <2 x half>* %s1, align 4
+  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
+  %l2 = load <2 x half>, <2 x half>* %s2, align 4
+  %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2
+  %l3 = load <2 x half>, <2 x half>* %s3, align 4
+  %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x half> %l3, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s = shufflevector <4 x half> %t1, <4 x half> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x half> %s, <6 x half> *%dst
+  ret void
+}
+
+define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
+; CHECK-LABEL: vst3_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.16 q0[1], r3
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.16 q0[3], r4
+; CHECK-NEXT:    vmov.32 q1[0], r12
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.32 q1[1], lr
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q3[0], r3
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s16, s0
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s4
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q0[1], r4
+; CHECK-NEXT:    vmovx.f16 s4, s9
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    strd r2, r0, [r1, #16]
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
+  %l1 = load <4 x half>, <4 x half>* %s1, align 4
+  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
+  %l2 = load <4 x half>, <4 x half>* %s2, align 4
+  %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2
+  %l3 = load <4 x half>, <4 x half>* %s3, align 4
+  %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x half> %l3, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <8 x half> %t1, <8 x half> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x half> %s, <12 x half> *%dst
+  ret void
+}
+
+define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
+; CHECK-LABEL: vst3_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10}
+; CHECK-NEXT:    vpush {d8, d9, d10}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.16 q3[1], r3
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmovx.f16 s16, s11
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s3
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s7
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vrev32.16 q4, q0
+; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmovx.f16 s16, s18
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s10
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s0
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
+  %l1 = load <8 x half>, <8 x half>* %s1, align 4
+  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
+  %l2 = load <8 x half>, <8 x half>* %s2, align 4
+  %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
+  %l3 = load <8 x half>, <8 x half>* %s3, align 4
+  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x half> %l3, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x half> %s, <24 x half> *%dst
+  ret void
+}
+
+define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) {
+; CHECK-LABEL: vst3_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.16 q6[0], r2
+; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vrev32.16 q3, q2
+; CHECK-NEXT:    vmov.16 q6[1], r3
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vmovx.f16 s12, s14
+; CHECK-NEXT:    vmov.16 q6[2], r2
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmovx.f16 s12, s2
+; CHECK-NEXT:    vmov.16 q6[6], r2
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmovx.f16 s28, s3
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s11
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s7
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s20
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s16
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s12
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
+; CHECK-NEXT:    vmovx.f16 s24, s21
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmov.16 q6[0], r2
+; CHECK-NEXT:    vrev32.16 q7, q4
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov r0, s29
+; CHECK-NEXT:    vmovx.f16 s8, s13
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmovx.f16 s8, s30
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s22
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s18
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s14
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmovx.f16 s8, s23
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s19
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s15
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s16
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
+  %l1 = load <16 x half>, <16 x half>* %s1, align 4
+  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
+  %l2 = load <16 x half>, <16 x half>* %s2, align 4
+  %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2
+  %l3 = load <16 x half>, <16 x half>* %s3, align 4
+  %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x half> %l3, <16 x half> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <32 x half> %t1, <32 x half> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x half> %s, <48 x half> *%dst
+  ret void
+}
+
+; f64
+
+define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) {
+; CHECK-LABEL: vst3_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vmov.f64 d7, d1
+; CHECK-NEXT:    vmov.f64 d1, d4
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.f64 d2, d5
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
+  %l1 = load <2 x double>, <2 x double>* %s1, align 4
+  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
+  %l2 = load <2 x double>, <2 x double>* %s2, align 4
+  %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
+  %l3 = load <2 x double>, <2 x double>* %s3, align 4
+  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x double> %l3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x double> %s, <6 x double> *%dst
+  ret void
+}
+
+define void @vst3_v4f64(<4 x double> *%src, <12 x double> *%dst) {
+; CHECK-LABEL: vst3_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d6, d15
+; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT:    vmov.f64 d10, d2
+; CHECK-NEXT:    vmov.f64 d7, d1
+; CHECK-NEXT:    vmov.f64 d11, d12
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
+; CHECK-NEXT:    vmov.f64 d12, d4
+; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vmov.f64 d1, d5
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d2, d8
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    vmov.f64 d13, d14
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
+  %l1 = load <4 x double>, <4 x double>* %s1, align 4
+  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
+  %l2 = load <4 x double>, <4 x double>* %s2, align 4
+  %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
+  %l3 = load <4 x double>, <4 x double>* %s3, align 4
+  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x double> %l3, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x double> %s, <12 x double> *%dst
+  ret void
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
new file mode 100644
index 000000000000..ab9e39be242f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -0,0 +1,2289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
+
+; i32
+
+define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) {
+; CHECK-LABEL: vst4_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
+; CHECK-NEXT:    vmov.32 q1[0], r4
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.f64 d0, d2
+; CHECK-NEXT:    vmov.f32 s1, s6
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov.32 q1[2], r12
+; CHECK-NEXT:    vmov.32 q1[3], lr
+; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vmov.f32 s10, s0
+; CHECK-NEXT:    vmov.f32 s11, s2
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
+  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
+  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
+  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
+  %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
+  %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
+  %s4 = getelementptr <2 x i32>, <2 x i32>* %src, i32 3
+  %l4 = load <2 x i32>, <2 x i32>* %s3, align 4
+  %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i32> %l3, <2 x i32> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32> %s, <8 x i32> *%dst
+  ret void
+}
+
+define void @vst4_v4i32(<4 x i32> *%src, <16 x i32> *%dst) {
+; CHECK-LABEL: vst4_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov.32 r0, q3[1]
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s1, s5
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov.32 r0, q3[0]
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s9, s4
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov.f32 s16, s8
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s20, s11
+; CHECK-NEXT:    vmov.32 r0, q3[2]
+; CHECK-NEXT:    vmov.f32 s8, s10
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s21, s7
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
+  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
+  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
+  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
+  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
+  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
+  %s4 = getelementptr <4 x i32>, <4 x i32>* %src, i32 3
+  %l4 = load <4 x i32>, <4 x i32>* %s3, align 4
+  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %s, <16 x i32> *%dst
+  ret void
+}
+
+define void @vst4_v8i32(<8 x i32> *%src, <32 x i32> *%dst) {
+; CHECK-LABEL: vst4_v8i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d2, d6
+; CHECK-NEXT:    vmov.32 r2, q5[0]
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r2, q5[1]
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s16, s13
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vmov.f32 s1, s17
+; CHECK-NEXT:    vmov.f64 d2, d7
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r2, q5[2]
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s5, s18
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s12, s15
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r2, q5[3]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s13, s19
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vmov.32 r0, q2[0]
+; CHECK-NEXT:    vmov.f64 d10, d14
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s21, s24
+; CHECK-NEXT:    vmov.32 r0, q2[1]
+; CHECK-NEXT:    vmov.f32 s22, s2
+; CHECK-NEXT:    vmov.f32 s23, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f64 d8, d15
+; CHECK-NEXT:    vmov.32 r0, q2[2]
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.32 r0, q2[3]
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s0, s29
+; CHECK-NEXT:    vmov.f32 s17, s26
+; CHECK-NEXT:    vmov.f32 s24, s29
+; CHECK-NEXT:    vmov.f32 s1, s25
+; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s6
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s4, s31
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s5, s27
+; CHECK-NEXT:    vstrw.32 q4, [r1, #96]
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vstrw.32 q1, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
+  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
+  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
+  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
+  %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
+  %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
+  %s4 = getelementptr <8 x i32>, <8 x i32>* %src, i32 3
+  %l4 = load <8 x i32>, <8 x i32>* %s3, align 4
+  %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x i32> %l3, <8 x i32> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i32> %s, <32 x i32> *%dst
+  ret void
+}
+
+define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) {
+; CHECK-LABEL: vst4_v16i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #160
+; CHECK-NEXT:    sub sp, #160
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT:    vmov.32 r2, q1[2]
+; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vdup.32 q3, r2
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s17, s2
+; CHECK-NEXT:    vmov.f32 s18, s14
+; CHECK-NEXT:    vstrw.32 q6, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s19, s15
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vstrw.32 q4, [r1, #224]
+; CHECK-NEXT:    vmov.f32 s16, s11
+; CHECK-NEXT:    vmov.32 r0, q1[3]
+; CHECK-NEXT:    vmov.f32 s17, s3
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vmov.f32 s19, s27
+; CHECK-NEXT:    vstrw.32 q4, [r1, #240]
+; CHECK-NEXT:    vmov.f64 d8, d4
+; CHECK-NEXT:    vmov.32 r0, q1[0]
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s17, s0
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vmov.f32 s19, s27
+; CHECK-NEXT:    vstrw.32 q4, [r1, #192]
+; CHECK-NEXT:    vmov.32 r0, q1[1]
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #208]
+; CHECK-NEXT:    vmov.f64 d0, d9
+; CHECK-NEXT:    vmov.32 r0, q2[2]
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s1, s26
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
+; CHECK-NEXT:    vmov.f32 s0, s19
+; CHECK-NEXT:    vmov.32 r0, q2[3]
+; CHECK-NEXT:    vmov.f32 s1, s27
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s8, s15
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vmov.f64 d2, d6
+; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
+; CHECK-NEXT:    vmov.32 r0, q5[0]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s5, s28
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q5[1]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s28, s13
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vmov.f32 s1, s29
+; CHECK-NEXT:    vmov.f64 d2, d7
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q5[2]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.32 r0, q5[3]
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s5, s30
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s9, s31
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 r0, q3[0]
+; CHECK-NEXT:    vmov.f64 d12, d4
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.32 r0, q3[1]
+; CHECK-NEXT:    vdup.32 q7, r0
+; CHECK-NEXT:    vmov.32 r0, q3[2]
+; CHECK-NEXT:    vmov.f32 s25, s20
+; CHECK-NEXT:    vmov.f32 s26, s2
+; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vmov.f32 s27, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s20, s9
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov.f32 s17, s22
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s4, s11
+; CHECK-NEXT:    vmov.f32 s28, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s5, s23
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vmov.f32 s12, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s29, s21
+; CHECK-NEXT:    vmov.32 r0, q2[0]
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.32 r0, q2[1]
+; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vmov.f32 s3, s23
+; CHECK-NEXT:    vstrw.32 q4, [r1, #96]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vstrw.32 q1, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [r1, #144]
+; CHECK-NEXT:    vstrw.32 q7, [r1, #80]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    add sp, #160
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
+  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
+  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
+  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
+  %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
+  %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
+  %s4 = getelementptr <16 x i32>, <16 x i32>* %src, i32 3
+  %l4 = load <16 x i32>, <16 x i32>* %s3, align 4
+  %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x i32> %l3, <16 x i32> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i32> %s, <64 x i32> *%dst
+  ret void
+}
+
+; i16
+
+define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) {
+; CHECK-LABEL: vst4_v2i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldrh r4, [r0]
+; CHECK-NEXT:    ldrh.w lr, [r0, #4]
+; CHECK-NEXT:    ldrh r3, [r0, #8]
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    ldrh.w r12, [r0, #6]
+; CHECK-NEXT:    ldrh r2, [r0, #10]
+; CHECK-NEXT:    ldrh r0, [r0, #2]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    vmov.16 q0[1], lr
+; CHECK-NEXT:    vmov.16 q0[2], r3
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.16 q0[5], r12
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
+  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
+  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
+  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
+  %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
+  %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
+  %s4 = getelementptr <2 x i16>, <2 x i16>* %src, i32 3
+  %l4 = load <2 x i16>, <2 x i16>* %s3, align 4
+  %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i16> %l3, <2 x i16> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i16> %s, <8 x i16> *%dst
+  ret void
+}
+
+define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) {
+; CHECK-LABEL: vst4_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrh.u32 q1, [r0]
+; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
+  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
+  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
+  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
+  %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
+  %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
+  %s4 = getelementptr <4 x i16>, <4 x i16>* %src, i32 3
+  %l4 = load <4 x i16>, <4 x i16>* %s3, align 4
+  %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i16> %l3, <4 x i16> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i16> %s, <16 x i16> *%dst
+  ret void
+}
+
+define void @vst4_v8i16(<8 x i16> *%src, <32 x i16> *%dst) {
+; CHECK-LABEL: vst4_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.u16 r2, q4[0]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r2, q4[1]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[2]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[3]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[6]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[7]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q6[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[1]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q7[2], r0
+; CHECK-NEXT:    vmov.16 q7[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q7[6], r0
+; CHECK-NEXT:    vmov.16 q7[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[2]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[3]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[6]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q7[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
+; CHECK-NEXT:    vmov.u16 r0, q6[0]
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[1]
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
+  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
+  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
+  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
+  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
+  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
+  %s4 = getelementptr <8 x i16>, <8 x i16>* %src, i32 3
+  %l4 = load <8 x i16>, <8 x i16>* %s3, align 4
+  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i16> %s, <32 x i16> *%dst
+  ret void
+}
+
+define void @vst4_v16i16(<16 x i16> *%src, <64 x i16> *%dst) {
+; CHECK-LABEL: vst4_v16i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #48]
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[0]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[1]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[1]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q2[6], r2
+; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[2]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[3]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vstrw.32 q5, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.16 q2[6], r2
+; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[4]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[4]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[5]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[5]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vstrw.32 q5, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q2[6], r2
+; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[6]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[7]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[7]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q7[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q7[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q2[6], r2
+; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q7[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q7[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q7[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q7[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q7[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.16 q7[7], r2
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[0]
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[1]
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[2]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[3]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[5]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1, #80]
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[6]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[7]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vstrw.32 q2, [r1, #96]
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vstrw.32 q6, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
+  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
+  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
+  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
+  %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
+  %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
+  %s4 = getelementptr <16 x i16>, <16 x i16>* %src, i32 3
+  %l4 = load <16 x i16>, <16 x i16>* %s3, align 4
+  %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x i16> %l3, <16 x i16> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i16> %s, <64 x i16> *%dst
+  ret void
+}
+
+; i8
+
+define void @vst4_v2i8(<2 x i8> *%src, <8 x i8> *%dst) {
+; CHECK-LABEL: vst4_v2i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldrb r2, [r0]
+; CHECK-NEXT:    ldrb r3, [r0, #1]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    ldrb.w r12, [r0, #2]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    ldrb.w lr, [r0, #3]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    ldrb r4, [r0, #5]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrb r0, [r0, #4]
+; CHECK-NEXT:    vmov.16 q0[1], r12
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.16 q0[4], r3
+; CHECK-NEXT:    vmov.16 q0[5], lr
+; CHECK-NEXT:    vmov.16 q0[6], r4
+; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vstrb.16 q0, [r1]
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
+  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
+  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
+  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
+  %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
+  %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
+  %s4 = getelementptr <2 x i8>, <2 x i8>* %src, i32 3
+  %l4 = load <2 x i8>, <2 x i8>* %s3, align 4
+  %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i8> %l3, <2 x i8> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i8> %s, <8 x i8> *%dst
+  ret void
+}
+
+define void @vst4_v4i8(<4 x i8> *%src, <16 x i8> *%dst) {
+; CHECK-LABEL: vst4_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u32 q1, [r0]
+; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
+; CHECK-NEXT:    vldrb.u32 q3, [r0, #8]
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.8 q0[0], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.8 q0[1], r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov.8 q0[3], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.8 q0[5], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.8 q0[6], r0
+; CHECK-NEXT:    vmov.8 q0[7], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
+  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
+  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
+  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
+  %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
+  %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
+  %s4 = getelementptr <4 x i8>, <4 x i8>* %src, i32 3
+  %l4 = load <4 x i8>, <4 x i8>* %s3, align 4
+  %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i8> %l3, <4 x i8> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i8> %s, <16 x i8> *%dst
+  ret void
+}
+
+define void @vst4_v8i8(<8 x i8> *%src, <32 x i8> *%dst) {
+; CHECK-LABEL: vst4_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrb.u16 q1, [r0]
+; CHECK-NEXT:    vldrb.u16 q2, [r0, #8]
+; CHECK-NEXT:    vldrb.u16 q3, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.8 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.8 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov.8 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.8 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q0[6], r0
+; CHECK-NEXT:    vmov.8 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.8 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.8 q4[2], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.8 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.8 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q4[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.8 q4[6], r0
+; CHECK-NEXT:    vmov.8 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.8 q4[12], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q4[13], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.8 q4[14], r0
+; CHECK-NEXT:    vmov.8 q4[15], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
+  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
+  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
+  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
+  %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
+  %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
+  %s4 = getelementptr <8 x i8>, <8 x i8>* %src, i32 3
+  %l4 = load <8 x i8>, <8 x i8>* %s3, align 4
+  %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x i8> %l3, <8 x i8> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i8> %s, <32 x i8> *%dst
+  ret void
+}
+
+define void @vst4_v16i8(<16 x i8> *%src, <64 x i8> *%dst) {
+; CHECK-LABEL: vst4_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.u8 r2, q1[4]
+; CHECK-NEXT:    vmov.8 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[4]
+; CHECK-NEXT:    vmov.8 q4[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    vmov.u8 r0, q3[4]
+; CHECK-NEXT:    vmov.8 q4[4], r2
+; CHECK-NEXT:    vmov.8 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r2, q2[5]
+; CHECK-NEXT:    vmov.8 q4[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[6]
+; CHECK-NEXT:    vmov.8 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q5[6], r0
+; CHECK-NEXT:    vmov.8 q4[8], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[6]
+; CHECK-NEXT:    vmov.8 q5[7], r0
+; CHECK-NEXT:    vmov.8 q4[9], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.u8 r0, q3[6]
+; CHECK-NEXT:    vmov.8 q4[12], r2
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r2, q2[7]
+; CHECK-NEXT:    vmov.8 q4[13], r2
+; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[7]
+; CHECK-NEXT:    vmov.u8 r2, q4[0]
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.8 q0[0], r2
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov.u8 r2, q4[1]
+; CHECK-NEXT:    vmov.8 q0[1], r2
+; CHECK-NEXT:    vmov.u8 r0, q5[2]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[3]
+; CHECK-NEXT:    vmov.8 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[4]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[5]
+; CHECK-NEXT:    vmov.8 q0[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[6]
+; CHECK-NEXT:    vmov.8 q0[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[7]
+; CHECK-NEXT:    vmov.8 q0[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[8]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[9]
+; CHECK-NEXT:    vmov.8 q0[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[10]
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[11]
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[12]
+; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[13]
+; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[14]
+; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[15]
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-NEXT:    vmov.8 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.8 q5[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q5[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q5[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.u8 r0, q5[0]
+; CHECK-NEXT:    vmov.8 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[1]
+; CHECK-NEXT:    vmov.8 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[0]
+; CHECK-NEXT:    vmov.8 q6[2], r0
+; CHECK-NEXT:    vmov.8 q6[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[1]
+; CHECK-NEXT:    vmov.8 q6[6], r0
+; CHECK-NEXT:    vmov.8 q6[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q6[10], r0
+; CHECK-NEXT:    vmov.8 q6[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[3]
+; CHECK-NEXT:    vmov.8 q6[14], r0
+; CHECK-NEXT:    vmov.8 q6[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[2]
+; CHECK-NEXT:    vmov.8 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[3]
+; CHECK-NEXT:    vmov.8 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[4]
+; CHECK-NEXT:    vmov.8 q4[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[5]
+; CHECK-NEXT:    vmov.8 q4[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[6]
+; CHECK-NEXT:    vmov.8 q4[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[7]
+; CHECK-NEXT:    vmov.8 q4[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[8]
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[9]
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[10]
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[12]
+; CHECK-NEXT:    vmov.8 q4[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q5[13]
+; CHECK-NEXT:    vmov.8 q4[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.8 q4[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmov.8 q4[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.8 q6[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.8 q6[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q6[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.8 q6[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.8 q6[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.8 q6[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.8 q6[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.8 q6[13], r0
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.u8 r0, q6[0]
+; CHECK-NEXT:    vmov.8 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[1]
+; CHECK-NEXT:    vmov.8 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[12]
+; CHECK-NEXT:    vmov.8 q7[2], r0
+; CHECK-NEXT:    vmov.8 q7[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[13]
+; CHECK-NEXT:    vmov.8 q7[6], r0
+; CHECK-NEXT:    vmov.8 q7[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[14]
+; CHECK-NEXT:    vmov.8 q7[10], r0
+; CHECK-NEXT:    vmov.8 q7[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[15]
+; CHECK-NEXT:    vmov.8 q7[14], r0
+; CHECK-NEXT:    vmov.8 q7[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[2]
+; CHECK-NEXT:    vmov.8 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[3]
+; CHECK-NEXT:    vmov.8 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[4]
+; CHECK-NEXT:    vmov.8 q5[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.8 q5[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[6]
+; CHECK-NEXT:    vmov.8 q5[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[7]
+; CHECK-NEXT:    vmov.8 q5[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[8]
+; CHECK-NEXT:    vmov.8 q5[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[9]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[10]
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[11]
+; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[12]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[13]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[14]
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[15]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.8 q6[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.8 q6[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.8 q6[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.8 q6[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q6[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.8 q6[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.8 q6[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.8 q6[13], r0
+; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
+; CHECK-NEXT:    vmov.u8 r0, q6[0]
+; CHECK-NEXT:    vmov.8 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[1]
+; CHECK-NEXT:    vmov.8 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[8]
+; CHECK-NEXT:    vmov.8 q2[2], r0
+; CHECK-NEXT:    vmov.8 q2[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[9]
+; CHECK-NEXT:    vmov.8 q2[6], r0
+; CHECK-NEXT:    vmov.8 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[10]
+; CHECK-NEXT:    vmov.8 q2[10], r0
+; CHECK-NEXT:    vmov.8 q2[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[11]
+; CHECK-NEXT:    vmov.8 q2[14], r0
+; CHECK-NEXT:    vmov.8 q2[15], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[4]
+; CHECK-NEXT:    vmov.8 q1[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.8 q1[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-NEXT:    vmov.8 q1[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov.8 q1[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[8]
+; CHECK-NEXT:    vmov.8 q1[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[9]
+; CHECK-NEXT:    vmov.8 q1[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.8 q1[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.8 q1[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[12]
+; CHECK-NEXT:    vmov.8 q1[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[13]
+; CHECK-NEXT:    vmov.8 q1[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.8 q1[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.8 q1[15], r0
+; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
+  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
+  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
+  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
+  %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2
+  %l3 = load <16 x i8>, <16 x i8>* %s3, align 4
+  %s4 = getelementptr <16 x i8>, <16 x i8>* %src, i32 3
+  %l4 = load <16 x i8>, <16 x i8>* %s3, align 4
+  %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x i8> %l3, <16 x i8> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i8> %s, <64 x i8> *%dst
+  ret void
+}
+
+; i64
+
+define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
+; CHECK-LABEL: vst4_v2i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d4, d6
+; CHECK-NEXT:    vmov.f32 s9, s13
+; CHECK-NEXT:    vmov.f32 s10, s0
+; CHECK-NEXT:    vmov.f32 s11, s1
+; CHECK-NEXT:    vmov.f32 s0, s14
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s1, s15
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s13, s5
+; CHECK-NEXT:    vmov.f32 s14, s4
+; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
+  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
+  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
+  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
+  %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2
+  %l3 = load <2 x i64>, <2 x i64>* %s3, align 4
+  %s4 = getelementptr <2 x i64>, <2 x i64>* %src, i32 3
+  %l4 = load <2 x i64>, <2 x i64>* %s3, align 4
+  %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x i64> %l3, <2 x i64> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i64> %s, <8 x i64> *%dst
+  ret void
+}
+
+define void @vst4_v4i64(<4 x i64> *%src, <16 x i64> *%dst) {
+; CHECK-LABEL: vst4_v4i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d6, d10
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vmov.f32 s13, s21
+; CHECK-NEXT:    vmov.f32 s14, s0
+; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vmov.f32 s0, s22
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.f32 s1, s23
+; CHECK-NEXT:    vmov.f64 d10, d12
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s21, s25
+; CHECK-NEXT:    vmov.f32 s22, s8
+; CHECK-NEXT:    vmov.f32 s23, s9
+; CHECK-NEXT:    vmov.f32 s8, s26
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s9, s27
+; CHECK-NEXT:    vmov.f64 d12, d2
+; CHECK-NEXT:    vstrw.32 q2, [r1, #96]
+; CHECK-NEXT:    vmov.f64 d14, d8
+; CHECK-NEXT:    vmov.f32 s25, s5
+; CHECK-NEXT:    vmov.f32 s29, s17
+; CHECK-NEXT:    vmov.f32 s26, s4
+; CHECK-NEXT:    vmov.f32 s30, s16
+; CHECK-NEXT:    vmov.f32 s27, s5
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s31, s17
+; CHECK-NEXT:    vmov.f32 s16, s18
+; CHECK-NEXT:    vstrw.32 q7, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s17, s19
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
+  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
+  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
+  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
+  %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2
+  %l3 = load <4 x i64>, <4 x i64>* %s3, align 4
+  %s4 = getelementptr <4 x i64>, <4 x i64>* %src, i32 3
+  %l4 = load <4 x i64>, <4 x i64>* %s3, align 4
+  %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i64> %l3, <4 x i64> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i64> %s, <16 x i64> *%dst
+  ret void
+}
+
+; f32
+
+define void @vst4_v2f32(<2 x float> *%src, <8 x float> *%dst) {
+; CHECK-LABEL: vst4_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr s0, [r0]
+; CHECK-NEXT:    vldr s4, [r0, #4]
+; CHECK-NEXT:    vldr s1, [r0, #8]
+; CHECK-NEXT:    vldr s5, [r0, #12]
+; CHECK-NEXT:    vldr s2, [r0, #16]
+; CHECK-NEXT:    vldr s6, [r0, #20]
+; CHECK-NEXT:    vmov.f32 s3, s2
+; CHECK-NEXT:    vmov.f32 s7, s6
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
+  %l1 = load <2 x float>, <2 x float>* %s1, align 4
+  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
+  %l2 = load <2 x float>, <2 x float>* %s2, align 4
+  %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2
+  %l3 = load <2 x float>, <2 x float>* %s3, align 4
+  %s4 = getelementptr <2 x float>, <2 x float>* %src, i32 3
+  %l4 = load <2 x float>, <2 x float>* %s3, align 4
+  %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x float> %l3, <2 x float> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s = shufflevector <4 x float> %t1, <4 x float> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x float> %s, <8 x float> *%dst
+  ret void
+}
+
+define void @vst4_v4f32(<4 x float> *%src, <16 x float> *%dst) {
+; CHECK-LABEL: vst4_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov.32 r0, q3[1]
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s1, s5
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov.32 r0, q3[0]
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vdup.32 q4, r0
+; CHECK-NEXT:    vmov.f32 s9, s4
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov.f32 s16, s8
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s20, s11
+; CHECK-NEXT:    vmov.32 r0, q3[2]
+; CHECK-NEXT:    vmov.f32 s8, s10
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s21, s7
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
+  %l1 = load <4 x float>, <4 x float>* %s1, align 4
+  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
+  %l2 = load <4 x float>, <4 x float>* %s2, align 4
+  %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
+  %l3 = load <4 x float>, <4 x float>* %s3, align 4
+  %s4 = getelementptr <4 x float>, <4 x float>* %src, i32 3
+  %l4 = load <4 x float>, <4 x float>* %s3, align 4
+  %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x float> %s, <16 x float> *%dst
+  ret void
+}
+
+define void @vst4_v8f32(<8 x float> *%src, <32 x float> *%dst) {
+; CHECK-LABEL: vst4_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d2, d6
+; CHECK-NEXT:    vmov.32 r2, q5[0]
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r2, q5[1]
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s16, s13
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vmov.f32 s1, s17
+; CHECK-NEXT:    vmov.f64 d2, d7
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r2, q5[2]
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s5, s18
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s12, s15
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r2, q5[3]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s13, s19
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vmov.32 r0, q2[0]
+; CHECK-NEXT:    vmov.f64 d10, d14
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s21, s24
+; CHECK-NEXT:    vmov.32 r0, q2[1]
+; CHECK-NEXT:    vmov.f32 s22, s2
+; CHECK-NEXT:    vmov.f32 s23, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f64 d8, d15
+; CHECK-NEXT:    vmov.32 r0, q2[2]
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.32 r0, q2[3]
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s0, s29
+; CHECK-NEXT:    vmov.f32 s17, s26
+; CHECK-NEXT:    vmov.f32 s24, s29
+; CHECK-NEXT:    vmov.f32 s1, s25
+; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s6
+; CHECK-NEXT:    vmov.f32 s19, s7
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s4, s31
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s5, s27
+; CHECK-NEXT:    vstrw.32 q4, [r1, #96]
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vstrw.32 q1, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
+  %l1 = load <8 x float>, <8 x float>* %s1, align 4
+  %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
+  %l2 = load <8 x float>, <8 x float>* %s2, align 4
+  %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2
+  %l3 = load <8 x float>, <8 x float>* %s3, align 4
+  %s4 = getelementptr <8 x float>, <8 x float>* %src, i32 3
+  %l4 = load <8 x float>, <8 x float>* %s3, align 4
+  %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x float> %l3, <8 x float> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = shufflevector <16 x float> %t1, <16 x float> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x float> %s, <32 x float> *%dst
+  ret void
+}
+
+define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) {
+; CHECK-LABEL: vst4_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #160
+; CHECK-NEXT:    sub sp, #160
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT:    vmov.32 r2, q1[2]
+; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vdup.32 q3, r2
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s17, s2
+; CHECK-NEXT:    vmov.f32 s18, s14
+; CHECK-NEXT:    vstrw.32 q6, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s19, s15
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vstrw.32 q4, [r1, #224]
+; CHECK-NEXT:    vmov.f32 s16, s11
+; CHECK-NEXT:    vmov.32 r0, q1[3]
+; CHECK-NEXT:    vmov.f32 s17, s3
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vmov.f32 s19, s27
+; CHECK-NEXT:    vstrw.32 q4, [r1, #240]
+; CHECK-NEXT:    vmov.f64 d8, d4
+; CHECK-NEXT:    vmov.32 r0, q1[0]
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s17, s0
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vmov.f32 s19, s27
+; CHECK-NEXT:    vstrw.32 q4, [r1, #192]
+; CHECK-NEXT:    vmov.32 r0, q1[1]
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #208]
+; CHECK-NEXT:    vmov.f64 d0, d9
+; CHECK-NEXT:    vmov.32 r0, q2[2]
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s1, s26
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
+; CHECK-NEXT:    vmov.f32 s0, s19
+; CHECK-NEXT:    vmov.32 r0, q2[3]
+; CHECK-NEXT:    vmov.f32 s1, s27
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmov.f32 s8, s15
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vmov.f64 d2, d6
+; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
+; CHECK-NEXT:    vmov.32 r0, q5[0]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s5, s28
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q5[1]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s28, s13
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vmov.f32 s1, s29
+; CHECK-NEXT:    vmov.f64 d2, d7
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.32 r0, q5[2]
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.32 r0, q5[3]
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s5, s30
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s9, s31
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 r0, q3[0]
+; CHECK-NEXT:    vmov.f64 d12, d4
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.32 r0, q3[1]
+; CHECK-NEXT:    vdup.32 q7, r0
+; CHECK-NEXT:    vmov.32 r0, q3[2]
+; CHECK-NEXT:    vmov.f32 s25, s20
+; CHECK-NEXT:    vmov.f32 s26, s2
+; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vmov.f32 s27, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s20, s9
+; CHECK-NEXT:    vmov.32 r0, q3[3]
+; CHECK-NEXT:    vmov.f32 s17, s22
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s4, s11
+; CHECK-NEXT:    vmov.f32 s28, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov.f32 s5, s23
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vmov.f32 s12, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s29, s21
+; CHECK-NEXT:    vmov.32 r0, q2[0]
+; CHECK-NEXT:    vdup.32 q5, r0
+; CHECK-NEXT:    vmov.32 r0, q2[1]
+; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vmov.f32 s3, s23
+; CHECK-NEXT:    vstrw.32 q4, [r1, #96]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vstrw.32 q1, [r1, #112]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [r1, #144]
+; CHECK-NEXT:    vstrw.32 q7, [r1, #80]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    add sp, #160
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
+  %l1 = load <16 x float>, <16 x float>* %s1, align 4
+  %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
+  %l2 = load <16 x float>, <16 x float>* %s2, align 4
+  %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2
+  %l3 = load <16 x float>, <16 x float>* %s3, align 4
+  %s4 = getelementptr <16 x float>, <16 x float>* %src, i32 3
+  %l4 = load <16 x float>, <16 x float>* %s3, align 4
+  %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x float> %l3, <16 x float> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s = shufflevector <32 x float> %t1, <32 x float> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x float> %s, <64 x float> *%dst
+  ret void
+}
+
+; f16
+
+define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
+; CHECK-LABEL: vst4_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldrd r3, r12, [r0]
+; CHECK-NEXT:    ldr r0, [r0, #8]
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q2[0], r12
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q1[0], r3
+; CHECK-NEXT:    vmov lr, s0
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.16 q0[0], lr
+; CHECK-NEXT:    vmov.16 q0[1], r4
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q1[0], r3
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vmov.16 q1[3], lr
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.16 q1[6], r4
+; CHECK-NEXT:    vmov.16 q1[7], r4
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
+  %l1 = load <2 x half>, <2 x half>* %s1, align 4
+  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
+  %l2 = load <2 x half>, <2 x half>* %s2, align 4
+  %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2
+  %l3 = load <2 x half>, <2 x half>* %s3, align 4
+  %s4 = getelementptr <2 x half>, <2 x half>* %src, i32 3
+  %l4 = load <2 x half>, <2 x half>* %s3, align 4
+  %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x half> %l3, <2 x half> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s = shufflevector <4 x half> %t1, <4 x half> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x half> %s, <8 x half> *%dst
+  ret void
+}
+
+define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
+; CHECK-LABEL: vst4_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    ldrd r2, r3, [r0, #16]
+; CHECK-NEXT:    ldm.w r0, {r4, r5, r6}
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    ldr r0, [r0, #12]
+; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmov lr, s4
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s8
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmovx.f16 s8, s9
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov.16 q1[0], r4
+; CHECK-NEXT:    vmov.16 q1[1], r5
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    vmov.16 q0[0], lr
+; CHECK-NEXT:    vmov.16 q1[2], r4
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov.16 q0[1], r12
+; CHECK-NEXT:    vmov.32 q2[0], r6
+; CHECK-NEXT:    vmov.16 q1[3], r4
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.16 q0[2], r3
+; CHECK-NEXT:    vmov r5, s5
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q3[0], r5
+; CHECK-NEXT:    vmovx.f16 s16, s9
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[3], r3
+; CHECK-NEXT:    vmov.16 q3[4], r4
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[6], r2
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmov.16 q3[7], r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[3], lr
+; CHECK-NEXT:    vmovx.f16 s4, s8
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.16 q3[7], r12
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+entry:
+  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
+  %l1 = load <4 x half>, <4 x half>* %s1, align 4
+  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
+  %l2 = load <4 x half>, <4 x half>* %s2, align 4
+  %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2
+  %l3 = load <4 x half>, <4 x half>* %s3, align 4
+  %s4 = getelementptr <4 x half>, <4 x half>* %src, i32 3
+  %l4 = load <4 x half>, <4 x half>* %s3, align 4
+  %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x half> %l3, <4 x half> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x half> %t1, <8 x half> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x half> %s, <16 x half> *%dst
+  ret void
+}
+
+define void @vst4_v8f16(<8 x half> *%src, <32 x half> *%dst) {
+; CHECK-LABEL: vst4_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmovx.f16 s16, s11
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov.16 q3[1], r3
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s3
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s7
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmovx.f16 s16, s10
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmovx.f16 s16, s9
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmovx.f16 s8, s8
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
+  %l1 = load <8 x half>, <8 x half>* %s1, align 4
+  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
+  %l2 = load <8 x half>, <8 x half>* %s2, align 4
+  %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
+  %l3 = load <8 x half>, <8 x half>* %s3, align 4
+  %s4 = getelementptr <8 x half>, <8 x half>* %src, i32 3
+  %l4 = load <8 x half>, <8 x half>* %s3, align 4
+  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x half> %s, <32 x half> *%dst
+  ret void
+}
+
+define void @vst4_v16f16(<16 x half> *%src, <64 x half> *%dst) {
+; CHECK-LABEL: vst4_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    vmovx.f16 s0, s22
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov.16 q6[0], r3
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov.16 q6[2], r2
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s14
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s18
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.16 q6[6], r2
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vstrw.32 q6, [r1, #96]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmovx.f16 s28, s23
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s15
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s19
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmovx.f16 s28, s20
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vstrw.32 q6, [r1, #112]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s12
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmovx.f16 s28, s16
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmovx.f16 s20, s21
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmovx.f16 s12, s13
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s17
+; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s16, s10
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmovx.f16 s16, s11
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s7
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s3
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s4
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmovx.f16 s16, s0
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmovx.f16 s8, s9
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q3[1], r2
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
+  %l1 = load <16 x half>, <16 x half>* %s1, align 4
+  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
+  %l2 = load <16 x half>, <16 x half>* %s2, align 4
+  %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2
+  %l3 = load <16 x half>, <16 x half>* %s3, align 4
+  %s4 = getelementptr <16 x half>, <16 x half>* %src, i32 3
+  %l4 = load <16 x half>, <16 x half>* %s3, align 4
+  %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %t2 = shufflevector <16 x half> %l3, <16 x half> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s = shufflevector <32 x half> %t1, <32 x half> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x half> %s, <64 x half> *%dst
+  ret void
+}
+
+; f64
+
+define void @vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) {
+; CHECK-LABEL: vst4_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vmov.f64 d7, d0
+; CHECK-NEXT:    vmov.f64 d0, d3
+; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.f64 d2, d4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f64 d3, d4
+; CHECK-NEXT:    vmov.f64 d4, d5
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
+  %l1 = load <2 x double>, <2 x double>* %s1, align 4
+  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
+  %l2 = load <2 x double>, <2 x double>* %s2, align 4
+  %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
+  %l3 = load <2 x double>, <2 x double>* %s3, align 4
+  %s4 = getelementptr <2 x double>, <2 x double>* %src, i32 3
+  %l4 = load <2 x double>, <2 x double>* %s3, align 4
+  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <2 x double> %l3, <2 x double> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x double> %s, <8 x double> *%dst
+  ret void
+}
+
+define void @vst4_v4f64(<4 x double> *%src, <16 x double> *%dst) {
+; CHECK-LABEL: vst4_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vmov.f64 d4, d10
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vmov.f64 d5, d0
+; CHECK-NEXT:    vmov.f64 d0, d11
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vmov.f64 d10, d12
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f64 d11, d6
+; CHECK-NEXT:    vmov.f64 d6, d13
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vmov.f64 d12, d2
+; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
+; CHECK-NEXT:    vmov.f64 d14, d8
+; CHECK-NEXT:    vmov.f64 d13, d2
+; CHECK-NEXT:    vmov.f64 d15, d8
+; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
+; CHECK-NEXT:    vmov.f64 d2, d3
+; CHECK-NEXT:    vstrw.32 q7, [r1, #80]
+; CHECK-NEXT:    vmov.f64 d8, d9
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
+  %l1 = load <4 x double>, <4 x double>* %s1, align 4
+  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
+  %l2 = load <4 x double>, <4 x double>* %s2, align 4
+  %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
+  %l3 = load <4 x double>, <4 x double>* %s3, align 4
+  %s4 = getelementptr <4 x double>, <4 x double>* %src, i32 3
+  %l4 = load <4 x double>, <4 x double>* %s3, align 4
+  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x double> %l3, <4 x double> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x double> %s, <16 x double> *%dst
+  ret void
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
index fdf6e1feda2d..1328c22a25c2 100644
--- a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
@@ -1,19 +1,30 @@
-; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefixes=NEON,ALL
-; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefixes=NO_NEON,ALL
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -mattr=+mve.fp -interleaved-access -S | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -interleaved-access -S | FileCheck %s --check-prefix=CHECK-NONE
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
 target triple = "arm---eabi"
 
 define void @load_factor2(<16 x i8>* %ptr) {
-; NEON-LABEL:    @load_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i8>, <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i8>, <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
   %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -22,16 +33,27 @@ define void @load_factor2(<16 x i8>* %ptr) {
 }
 
 define void @load_factor3(<6 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32>, <6 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32>, <6 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <6 x i32>, <6 x i32>* %ptr, align 4
   %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
@@ -41,17 +63,30 @@ define void @load_factor3(<6 x i32>* %ptr) {
 }
 
 define void @load_factor4(<16 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -62,15 +97,22 @@ define void @load_factor4(<16 x i32>* %ptr) {
 }
 
 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
-; NEON-LABEL:    @store_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
@@ -78,16 +120,29 @@ define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
 }
 
 define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
-; NEON-LABEL:    @store_factor3(
-; NEON:            [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -97,17 +152,30 @@ define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x
 }
 
 define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; NEON-LABEL:    @store_factor4(
-; NEON:            [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -117,15 +185,22 @@ define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x
 }
 
 define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
-; NEON-LABEL:    @load_ptrvec_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_ptrvec_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_ptrvec_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32*>, <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32*>, <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
   %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
@@ -133,19 +208,30 @@ define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
 }
 
 define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
-; NEON-LABEL:    @load_ptrvec_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
-; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_ptrvec_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_ptrvec_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32*>, <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32*>, <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
   %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
@@ -155,21 +241,34 @@ define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
 }
 
 define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
-; NEON-LABEL:    @load_ptrvec_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
-; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP9:%.*]] = inttoptr <2 x i32> [[TMP8]] to <2 x i32*>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_ptrvec_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_ptrvec_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = inttoptr <2 x i32> [[TMP8]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32*>, <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32*>, <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
   %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
@@ -180,17 +279,24 @@ define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
 }
 
 define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
-; NEON-LABEL:    @store_ptrvec_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = ptrtoint <2 x i32*> %v0 to <2 x i32>
-; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <2 x i32*> %v1 to <2 x i32>
-; NEON-NEXT:       [[TMP3:%.*]] = bitcast <4 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_ptrvec_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_ptrvec_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <2 x i32*> [[V0:%.*]] to <2 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <2 x i32*> [[V1:%.*]] to <2 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-MVE-NEXT:    store <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NONE-NEXT:    store <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
@@ -198,18 +304,31 @@ define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %
 }
 
 define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
-; NEON-LABEL:    @store_ptrvec_factor3(
-; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
-; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
-; NEON-NEXT:       [[TMP3:%.*]] = bitcast <6 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_ptrvec_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_ptrvec_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-MVE-NEXT:    store <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NONE-NEXT:    store <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -219,19 +338,32 @@ define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %
 }
 
 define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
-; NEON-LABEL:    @store_ptrvec_factor4(
-; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
-; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
-; NEON-NEXT:       [[TMP3:%.*]] = bitcast <8 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_ptrvec_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_ptrvec_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-MVE-NEXT:    store <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NONE-NEXT:    store <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -241,15 +373,24 @@ define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %
 }
 
 define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
-; NEON-LABEL:    @load_undef_mask_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_undef_mask_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_undef_mask_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
   %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
@@ -258,16 +399,27 @@ define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
 }
 
 define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
-; NEON-LABEL:    @load_undef_mask_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_undef_mask_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_undef_mask_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -277,17 +429,30 @@ define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
 }
 
 define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
-; NEON-LABEL:    @load_undef_mask_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_undef_mask_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_undef_mask_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
@@ -298,15 +463,22 @@ define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
 }
 
 define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
-; NEON-LABEL:    @store_undef_mask_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_undef_mask_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_undef_mask_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -314,16 +486,29 @@ define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32>
 }
 
 define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
-; NEON-LABEL:    @store_undef_mask_factor3(
-; NEON:            [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_undef_mask_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_undef_mask_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -333,17 +518,30 @@ define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32>
 }
 
 define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; NEON-LABEL:    @store_undef_mask_factor4(
-; NEON:            [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_undef_mask_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_undef_mask_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -353,16 +551,27 @@ define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32>
 }
 
 define void @load_address_space(<8 x i32> addrspace(1)* %ptr) {
-; NEON-LABEL:    @load_address_space(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p1i8(i8 addrspace(1)* [[TMP1]], i32 0)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_address_space(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_address_space(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p1i8(i8 addrspace(1)* [[TMP1]], i32 0)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_address_space(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_address_space(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
   %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <2 x i32> <i32 0, i32 3>
@@ -372,15 +581,22 @@ define void @load_address_space(<8 x i32> addrspace(1)* %ptr) {
 }
 
 define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1) {
-; NEON-LABEL:    @store_address_space(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_address_space(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_address_space(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> [[V1]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_address_space(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-MVE-NEXT:    store <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_address_space(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NONE-NEXT:    store <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   store <4 x i32> %interleaved.vec, <4 x i32> addrspace(1)* %ptr
@@ -388,9 +604,23 @@ define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2
 }
 
 define void @load_f16_factor2(<8 x half>* %ptr) {
-; ALL-LABEL: @load_f16_factor2(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @load_f16_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_f16_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_f16_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x half>, <8 x half>* %ptr, align 4
   %v0 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -399,9 +629,20 @@ define void @load_f16_factor2(<8 x half>* %ptr) {
 }
 
 define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1) {
-; ALL-LABEL: @store_f16_factor2(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @store_f16_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEON-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_f16_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-MVE-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_f16_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NONE-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x half> %interleaved.vec, <8 x half>* %ptr, align 4
@@ -409,9 +650,20 @@ define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1)
 }
 
 define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
-; ALL-LABEL:    @load_illegal_factor2(
-; ALL-NOT:        @llvm.arm.neon
-; ALL:            ret void
+; CHECK-NEON-LABEL: @load_illegal_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_illegal_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_illegal_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
   %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
@@ -419,9 +671,20 @@ define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
 }
 
 define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
-; ALL-LABEL: @store_illegal_factor2(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @store_illegal_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NEON-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_illegal_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-MVE-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_illegal_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NONE-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
   store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
@@ -429,17 +692,24 @@ define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind
 }
 
 define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -447,17 +717,24 @@ define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i
 }
 
 define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefbeg(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -465,17 +742,24 @@ define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0
 }
 
 define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefend(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -483,17 +767,24 @@ define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0
 }
 
 define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefmid(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -501,17 +792,24 @@ define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0
 }
 
 define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefmulti(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -519,16 +817,23 @@ define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %
 }
 
 define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -536,16 +841,23 @@ define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x
 }
 
 define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_undefmultimid(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -553,9 +865,20 @@ define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; ALL-LABEL: @store_general_mask_factor3_undef_fail(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -563,16 +886,23 @@ define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32>
 }
 
 define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_undeflane(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -580,9 +910,20 @@ define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %
 }
 
 define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; ALL-LABEL:    @store_general_mask_factor3_endstart_fail(
-; ALL-NOT:        @llvm.arm.neon
-; ALL:            ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -590,16 +931,23 @@ define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_endstart_pass(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_endstart_pass(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -607,9 +955,20 @@ define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; ALL-LABEL:    @store_general_mask_factor3_midstart_fail(
-; ALL-NOT:        @llvm.arm.neon
-; ALL:            ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -617,16 +976,23 @@ define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_midstart_pass(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_midstart_pass(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -636,35 +1002,54 @@ define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i3
 @g = external global <4 x float>
 
 ; The following does not give a valid interleaved store
-; ALL-LABEL: define void @no_interleave
-; ALL-NOT: call void @llvm.arm.neon.vst2
-; ALL: shufflevector
-; ALL: store
-; ALL: ret void
 define void @no_interleave(<4 x float> %a0) {
+; CHECK-NEON-LABEL: @no_interleave(
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-NEON-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @no_interleave(
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-MVE-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @no_interleave(
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-NONE-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-NONE-NEXT:    ret void
+;
   %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
   store <4 x float> %v0, <4 x float>* @g, align 16
   ret void
 }
 
 define void @load_factor2_wide2(<16 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor2_wide2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
-; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2_wide2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2_wide2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -673,32 +1058,41 @@ define void @load_factor2_wide2(<16 x i32>* %ptr) {
 }
 
 define void @load_factor2_wide3(<24 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor2_wide3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
-; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
-; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
-; NEON-NEXT:       [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
-; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
-; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; NEON-NEXT:       [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2_wide3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2_wide3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
@@ -707,26 +1101,37 @@ define void @load_factor2_wide3(<24 x i32>* %ptr) {
 }
 
 define void @load_factor3_wide(<24 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor3_wide(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
-; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
-; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor3_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor3_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor3_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor3_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -736,29 +1141,42 @@ define void @load_factor3_wide(<24 x i32>* %ptr) {
 }
 
 define void @load_factor4_wide(<32 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor4_wide(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
-; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
-; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
-; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
-; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor4_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor4_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor4_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor4_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
   %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -769,21 +1187,28 @@ define void @load_factor4_wide(<32 x i32>* %ptr) {
 }
 
 define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
-; NEON-LABEL:    @store_factor2_wide(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
-; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor2_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor2_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor2_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor2_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
@@ -791,23 +1216,36 @@ define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1)
 }
 
 define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
-; NEON-LABEL:    @store_factor3_wide(
-; NEON:            [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
-; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
-; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor3_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor3_wide(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor3_wide(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-MVE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor3_wide(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NONE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -817,25 +1255,38 @@ define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1,
 }
 
 define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
-; NEON-LABEL:    @store_factor4_wide(
-; NEON:            [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
-; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
-; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor4_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor4_wide(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor4_wide(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-MVE-NEXT:    store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor4_wide(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NONE-NEXT:    store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -845,9 +1296,23 @@ define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1,
 }
 
 define void @load_factor2_fp128(<4 x fp128>* %ptr) {
-; ALL-LABEL: @load_factor2_fp128(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @load_factor2_fp128(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_fp128(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_fp128(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
   %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
@@ -856,27 +1321,36 @@ define void @load_factor2_fp128(<4 x fp128>* %ptr) {
 }
 
 define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
-; NEON-LABEL:    @load_factor2_wide_pointer(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32*>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
-; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
-; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
-; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2_wide_pointer(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2_wide_pointer(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32*>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide_pointer(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32*>, <16 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide_pointer(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32*>, <16 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4
   %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -888,9 +1362,24 @@ define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
 ; actually load enough elements to satisfy the shuffle masks. (It would be
 ; possible to produce a vld2.v2i32, but that currently isn't implemented.)
 define void @load_out_of_range(<4 x i32>* %ptr) {
-; ALL-LABEL: @load_out_of_range(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @load_out_of_range(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_out_of_range(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_out_of_range(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
+;
   %interleaved.vec = load <4 x i32>, <4 x i32>* %ptr, align 4
   %v0 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %v1 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index 7c73323880f4..c25071156a27 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -7,11 +7,23 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1-m.main-none-eabi"
 
+; Factor 2
+
 %i8.2 = type {i8, i8}
 define void @i8_factor_2(%i8.2* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i8_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_4-LABEL:  Checking a loop in "i8_factor_2"
+; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 40 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_8-LABEL:  Checking a loop in "i8_factor_2"
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
@@ -43,6 +55,11 @@ define void @i16_factor_2(%i16.2* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i16_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in "i16_factor_2"
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
@@ -115,11 +132,193 @@ for.end:
   ret void
 }
 
+%i64.2 = type {i64, i64}
+define void @i64_factor_2(%i64.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i64_factor_2"
+; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_2"
+; VF_4:          Found an estimated cost of 80 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 48 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_2"
+; VF_8:          Found an estimated cost of 288 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 160 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16:         Found an estimated cost of 1088 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 576 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f16.2 = type {half, half}
+define void @f16_factor_2(%f16.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f16_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-LABEL:  Checking a loop in "f16_factor_2"
+; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-LABEL:  Checking a loop in "f16_factor_2"
+; VF_8:          Found an estimated cost of 272 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 144 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "f16_factor_2"
+; VF_16:         Found an estimated cost of 1056 for VF 16 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 544 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f16.2, %f16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f16.2, %f16.2* %data, i64 %i, i32 1
+  %tmp2 = load half, half* %tmp0, align 2
+  %tmp3 = load half, half* %tmp1, align 2
+  store half 0.0, half* %tmp0, align 2
+  store half 0.0, half* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f32.2 = type {float, float}
+define void @f32_factor_2(%f32.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f32_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_4-LABEL:  Checking a loop in "f32_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_8-LABEL:  Checking a loop in "f32_factor_2"
+; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "f32_factor_2"
+; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f32.2, %f32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f32.2, %f32.2* %data, i64 %i, i32 1
+  %tmp2 = load float, float* %tmp0, align 4
+  %tmp3 = load float, float* %tmp1, align 4
+  store float 0.0, float* %tmp0, align 4
+  store float 0.0, float* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f64.2 = type {double, double}
+define void @f64_factor_2(%f64.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f64_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_4-LABEL:  Checking a loop in "f64_factor_2"
+; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 40 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_8-LABEL:  Checking a loop in "f64_factor_2"
+; VF_8:          Found an estimated cost of 272 for VF 8 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 144 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "f64_factor_2"
+; VF_16:         Found an estimated cost of 1056 for VF 16 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 544 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f64.2, %f64.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f64.2, %f64.2* %data, i64 %i, i32 1
+  %tmp2 = load double, double* %tmp0, align 8
+  %tmp3 = load double, double* %tmp1, align 8
+  store double 0.0, double* %tmp0, align 8
+  store double 0.0, double* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+
+
+; Factor 3
+
 %i8.3 = type {i8, i8, i8}
 define void @i8_factor_3(%i8.3* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i8_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_4-LABEL:  Checking a loop in "i8_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_8-LABEL:  Checking a loop in "i8_factor_3"
 ; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
@@ -158,6 +357,13 @@ define void @i16_factor_3(%i16.3* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i16_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in "i16_factor_3"
 ; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
@@ -250,12 +456,240 @@ for.end:
   ret void
 }
 
+%i64.3 = type {i64, i64, i64}
+define void @i64_factor_3(%i64.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i64_factor_3"
+; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_3"
+; VF_4:          Found an estimated cost of 120 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 72 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_3"
+; VF_8:          Found an estimated cost of 432 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 240 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_3"
+; VF_16:         Found an estimated cost of 1632 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 864 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 2
+  %tmp3 = load i64, i64* %tmp0, align 8
+  %tmp4 = load i64, i64* %tmp1, align 8
+  %tmp5 = load i64, i64* %tmp2, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  store i64 0, i64* %tmp2, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f16.3 = type {half, half, half}
+define void @f16_factor_3(%f16.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f16_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_4-LABEL:  Checking a loop in "f16_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_8-LABEL:  Checking a loop in "f16_factor_3"
+; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 216 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_16-LABEL: Checking a loop in "f16_factor_3"
+; VF_16:         Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 2
+  %tmp3 = load half, half* %tmp0, align 2
+  %tmp4 = load half, half* %tmp1, align 2
+  %tmp5 = load half, half* %tmp2, align 2
+  store half 0.0, half* %tmp0, align 2
+  store half 0.0, half* %tmp1, align 2
+  store half 0.0, half* %tmp2, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f32.3 = type {float, float, float}
+define void @f32_factor_3(%f32.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f32_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_4-LABEL:  Checking a loop in "f32_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_8-LABEL:  Checking a loop in "f32_factor_3"
+; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 216 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_16-LABEL: Checking a loop in "f32_factor_3"
+; VF_16:         Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 2
+  %tmp3 = load float, float* %tmp0, align 4
+  %tmp4 = load float, float* %tmp1, align 4
+  %tmp5 = load float, float* %tmp2, align 4
+  store float 0.0, float* %tmp0, align 4
+  store float 0.0, float* %tmp1, align 4
+  store float 0.0, float* %tmp2, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f64.3 = type {double, double, double}
+define void @f64_factor_3(%f64.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f64_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_4-LABEL:  Checking a loop in "f64_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_8-LABEL:  Checking a loop in "f64_factor_3"
+; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 216 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_16-LABEL: Checking a loop in "f64_factor_3"
+; VF_16:         Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 2
+  %tmp3 = load double, double* %tmp0, align 8
+  %tmp4 = load double, double* %tmp1, align 8
+  %tmp5 = load double, double* %tmp2, align 8
+  store double 0.0, double* %tmp0, align 8
+  store double 0.0, double* %tmp1, align 8
+  store double 0.0, double* %tmp2, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+
+; Factor 4
 
 %i8.4 = type {i8, i8, i8, i8}
 define void @i8_factor_4(%i8.4* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i8_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_4-LABEL: Checking a loop in "i8_factor_4"
+; VF_4:         Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 80 for VF 4 For instruction: store i8 0, i8* %tmp3, align 1
 ; VF_8-LABEL:  Checking a loop in "i8_factor_4"
 ; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
@@ -301,6 +735,15 @@ define void @i16_factor_4(%i16.4* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i16_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_4-LABEL:  Checking a loop in "i16_factor_4"
 ; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
@@ -413,4 +856,256 @@ for.end:
   ret void
 }
 
+%i64.4 = type {i64, i64, i64, i64}
+define void @i64_factor_4(%i64.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i64_factor_4"
+; VF_2:          Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 32 for VF 2 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_4"
+; VF_4:          Found an estimated cost of 160 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 96 for VF 4 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_4"
+; VF_8:          Found an estimated cost of 576 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 320 for VF 8 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_4"
+; VF_16:         Found an estimated cost of 2176 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 1152 for VF 16 For instruction: store i64 0, i64* %tmp3, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 3
+  %tmp4 = load i64, i64* %tmp0, align 8
+  %tmp5 = load i64, i64* %tmp1, align 8
+  %tmp6 = load i64, i64* %tmp2, align 8
+  %tmp7 = load i64, i64* %tmp3, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  store i64 0, i64* %tmp2, align 8
+  store i64 0, i64* %tmp3, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f16.4 = type {half, half, half, half}
+define void @f16_factor_4(%f16.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f16_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_4-LABEL:  Checking a loop in "f16_factor_4"
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_8-LABEL:  Checking a loop in "f16_factor_4"
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_16-LABEL: Checking a loop in "f16_factor_4"
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 3
+  %tmp4 = load half, half* %tmp0, align 2
+  %tmp5 = load half, half* %tmp1, align 2
+  %tmp6 = load half, half* %tmp2, align 2
+  %tmp7 = load half, half* %tmp3, align 2
+  store half 0.0, half* %tmp0, align 2
+  store half 0.0, half* %tmp1, align 2
+  store half 0.0, half* %tmp2, align 2
+  store half 0.0, half* %tmp3, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f32.4 = type {float, float, float, float}
+define void @f32_factor_4(%f32.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f32_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_4-LABEL:  Checking a loop in "f32_factor_4"
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_8-LABEL:  Checking a loop in "f32_factor_4"
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_16-LABEL: Checking a loop in "f32_factor_4"
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 3
+  %tmp4 = load float, float* %tmp0, align 4
+  %tmp5 = load float, float* %tmp1, align 4
+  %tmp6 = load float, float* %tmp2, align 4
+  %tmp7 = load float, float* %tmp3, align 4
+  store float 0.0, float* %tmp0, align 4
+  store float 0.0, float* %tmp1, align 4
+  store float 0.0, float* %tmp2, align 4
+  store float 0.0, float* %tmp3, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f64.4 = type {double, double, double, double}
+define void @f64_factor_4(%f64.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f64_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+; VF_4-LABEL:  Checking a loop in "f64_factor_4"
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+; VF_8-LABEL:  Checking a loop in "f64_factor_4"
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+; VF_16-LABEL: Checking a loop in "f64_factor_4"
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 3
+  %tmp4 = load double, double* %tmp0, align 8
+  %tmp5 = load double, double* %tmp1, align 8
+  %tmp6 = load double, double* %tmp2, align 8
+  %tmp7 = load double, double* %tmp3, align 8
+  store double 0.0, double* %tmp0, align 8
+  store double 0.0, double* %tmp1, align 8
+  store double 0.0, double* %tmp2, align 8
+  store double 0.0, double* %tmp3, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "target-features"="+mve.fp" }


        


More information about the llvm-commits mailing list