[llvm] e841bd5 - [ARM] Extra MVE unaligned VLDn tests. NFC

Sun Jan 24 13:39:17 PST 2021

Author: David Green
Date: 2021-01-24T21:39:00Z
New Revision: e841bd5f335864b8c4d81cbf4df08460ef39f2ae

URL: https://github.com/llvm/llvm-project/commit/e841bd5f335864b8c4d81cbf4df08460ef39f2ae
DIFF: https://github.com/llvm/llvm-project/commit/e841bd5f335864b8c4d81cbf4df08460ef39f2ae.diff

LOG: [ARM] Extra MVE unaligned VLDn tests. NFC

Added: 
    

Modified: 
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index f33a7237151c..b5309aab1f60 100644

--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -98,6 +98,23 @@ entry:
   ret void
 }
 
+define void @vld2_v4i32_align1(<8 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: vld2_v4i32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x i32>, <8 x i32>* %src, align 1
+  %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x i32> %l1, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = add <4 x i32> %s1, %s2
+  store <4 x i32> %a, <4 x i32> *%dst
+  ret void
+}
+
 ; i16
 
 define void @vld2_v2i16(<4 x i16> *%src, <2 x i16> *%dst) {
@@ -115,7 +132,7 @@ define void @vld2_v2i16(<4 x i16> *%src, <2 x i16> *%dst) {
 ; CHECK-NEXT:    strh r0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <4 x i16>, <4 x i16>* %src, align 4
+  %l1 = load <4 x i16>, <4 x i16>* %src, align 2
   %s1 = shufflevector <4 x i16> %l1, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
   %s2 = shufflevector <4 x i16> %l1, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
   %a = add <2 x i16> %s1, %s2
@@ -126,13 +143,13 @@ entry:
 define void @vld2_v4i16(<8 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK-LABEL: vld2_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    vrev32.16 q1, q0
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x i16>, <8 x i16>* %src, align 4
+  %l1 = load <8 x i16>, <8 x i16>* %src, align 2
   %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %a = add <4 x i16> %s1, %s2
@@ -149,7 +166,7 @@ define void @vld2_v8i16(<16 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <16 x i16>, <16 x i16>* %src, align 4
+  %l1 = load <16 x i16>, <16 x i16>* %src, align 2
   %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   %a = add <8 x i16> %s1, %s2
@@ -170,7 +187,7 @@ define void @vld2_v16i16(<32 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <32 x i16>, <32 x i16>* %src, align 4
+  %l1 = load <32 x i16>, <32 x i16>* %src, align 2
   %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %a = add <16 x i16> %s1, %s2
@@ -178,6 +195,23 @@ entry:
   ret void
 }
 
+define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) {
+; CHECK-LABEL: vld2_v8i16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i16>, <16 x i16>* %src, align 1
+  %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %a = add <8 x i16> %s1, %s2
+  store <8 x i16> %a, <8 x i16> *%dst
+  ret void
+}
+
 ; i8
 
 define void @vld2_v2i8(<4 x i8> *%src, <2 x i8> *%dst) {
@@ -195,7 +229,7 @@ define void @vld2_v2i8(<4 x i8> *%src, <2 x i8> *%dst) {
 ; CHECK-NEXT:    strb r0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <4 x i8>, <4 x i8>* %src, align 4
+  %l1 = load <4 x i8>, <4 x i8>* %src, align 1
   %s1 = shufflevector <4 x i8> %l1, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
   %s2 = shufflevector <4 x i8> %l1, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
   %a = add <2 x i8> %s1, %s2
@@ -212,7 +246,7 @@ define void @vld2_v4i8(<8 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-NEXT:    vstrb.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x i8>, <8 x i8>* %src, align 4
+  %l1 = load <8 x i8>, <8 x i8>* %src, align 1
   %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %a = add <4 x i8> %s1, %s2
@@ -223,13 +257,13 @@ entry:
 define void @vld2_v8i8(<16 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-LABEL: vld2_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    vrev16.8 q1, q0
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    vstrb.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <16 x i8>, <16 x i8>* %src, align 4
+  %l1 = load <16 x i8>, <16 x i8>* %src, align 1
   %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   %a = add <8 x i8> %s1, %s2
@@ -246,7 +280,7 @@ define void @vld2_v16i8(<32 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <32 x i8>, <32 x i8>* %src, align 4
+  %l1 = load <32 x i8>, <32 x i8>* %src, align 1
   %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %a = add <16 x i8> %s1, %s2
@@ -286,7 +320,7 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
-  %l1 = load <4 x i64>, <4 x i64>* %src, align 4
+  %l1 = load <4 x i64>, <4 x i64>* %src, align 8
   %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
   %s2 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
   %a = add <2 x i64> %s1, %s2
@@ -350,7 +384,7 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
-  %l1 = load <8 x i64>, <8 x i64>* %src, align 4
+  %l1 = load <8 x i64>, <8 x i64>* %src, align 8
   %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %a = add <4 x i64> %s1, %s2
@@ -452,12 +486,30 @@ entry:
   ret void
 }
 
+define void @vld2_v4f32_align1(<8 x float> *%src, <4 x float> *%dst) {
+; CHECK-LABEL: vld2_v4f32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <8 x float>, <8 x float>* %src, align 1
+  %s1 = shufflevector <8 x float> %l1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s2 = shufflevector <8 x float> %l1, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %a = fadd <4 x float> %s1, %s2
+  store <4 x float> %a, <4 x float> *%dst
+  ret void
+}
+
 ; f16
 
 define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) {
 ; CHECK-LABEL: vld2_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    ldrd r2, r0, [r0]
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    ldr r0, [r0, #4]
 ; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmovx.f16 s4, s1
@@ -475,7 +527,7 @@ define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    str r0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <4 x half>, <4 x half>* %src, align 4
+  %l1 = load <4 x half>, <4 x half>* %src, align 2
   %s1 = shufflevector <4 x half> %l1, <4 x half> undef, <2 x i32> <i32 0, i32 2>
   %s2 = shufflevector <4 x half> %l1, <4 x half> undef, <2 x i32> <i32 1, i32 3>
   %a = fadd <2 x half> %s1, %s2
@@ -486,7 +538,7 @@ entry:
 define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld2_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmovx.f16 s8, s0
 ; CHECK-NEXT:    vmov r0, s1
@@ -513,7 +565,7 @@ define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    strd r0, r2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x half>, <8 x half>* %src, align 4
+  %l1 = load <8 x half>, <8 x half>* %src, align 2
   %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %a = fadd <4 x half> %s1, %s2
@@ -530,7 +582,7 @@ define void @vld2_v8f16(<16 x half> *%src, <8 x half> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <16 x half>, <16 x half>* %src, align 4
+  %l1 = load <16 x half>, <16 x half>* %src, align 2
   %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   %a = fadd <8 x half> %s1, %s2
@@ -551,7 +603,7 @@ define void @vld2_v16f16(<32 x half> *%src, <16 x half> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <32 x half>, <32 x half>* %src, align 4
+  %l1 = load <32 x half>, <32 x half>* %src, align 2
   %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %a = fadd <16 x half> %s1, %s2
@@ -559,6 +611,23 @@ entry:
   ret void
 }
 
+define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) {
+; CHECK-LABEL: vld2_v8f16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x half>, <16 x half>* %src, align 1
+  %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %a = fadd <8 x half> %s1, %s2
+  store <8 x half> %a, <8 x half> *%dst
+  ret void
+}
+
 ; f64
 
 define void @vld2_v2f64(<4 x double> *%src, <2 x double> *%dst) {
@@ -571,7 +640,7 @@ define void @vld2_v2f64(<4 x double> *%src, <2 x double> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <4 x double>, <4 x double>* %src, align 4
+  %l1 = load <4 x double>, <4 x double>* %src, align 8
   %s1 = shufflevector <4 x double> %l1, <4 x double> undef, <2 x i32> <i32 0, i32 2>
   %s2 = shufflevector <4 x double> %l1, <4 x double> undef, <2 x i32> <i32 1, i32 3>
   %a = fadd <2 x double> %s1, %s2
@@ -594,7 +663,7 @@ define void @vld2_v4f64(<8 x double> *%src, <4 x double> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x double>, <8 x double>* %src, align 4
+  %l1 = load <8 x double>, <8 x double>* %src, align 8
   %s1 = shufflevector <8 x double> %l1, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %a = fadd <4 x double> %s1, %s2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index c05eaa3fa09a..298896852130 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -188,12 +188,41 @@ entry:
   ret void
 }
 
+define void @vld4_v4i32_align1(<16 x i32> *%src, <4 x i32> *%dst) {
+; CHECK-LABEL: vld4_v4i32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.i32 q4, q2, q3
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x i32>, <16 x i32>* %src, align 1
+  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = add <4 x i32> %s1, %s2
+  %a2 = add <4 x i32> %s3, %s4
+  %a3 = add <4 x i32> %a1, %a2
+  store <4 x i32> %a3, <4 x i32> *%dst
+  ret void
+}
+
 ; i16
 
 define void @vld4_v2i16(<8 x i16> *%src, <2 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v2i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.u16 r2, q0[6]
 ; CHECK-NEXT:    add r0, r2
@@ -212,7 +241,7 @@ define void @vld4_v2i16(<8 x i16> *%src, <2 x i16> *%dst) {
 ; CHECK-NEXT:    strh r0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x i16>, <8 x i16>* %src, align 4
+  %l1 = load <8 x i16>, <8 x i16>* %src, align 2
   %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
   %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
   %s3 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
@@ -229,8 +258,8 @@ define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.u16 q1, [r0]
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
@@ -262,7 +291,7 @@ define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <16 x i16>, <16 x i16>* %src, align 4
+  %l1 = load <16 x i16>, <16 x i16>* %src, align 2
   %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
   %s3 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
@@ -291,7 +320,7 @@ define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <32 x i16>, <32 x i16>* %src, align 4
+  %l1 = load <32 x i16>, <32 x i16>* %src, align 2
   %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
   %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
   %s3 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
@@ -329,7 +358,7 @@ define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <64 x i16>, <64 x i16>* %src, align 4
+  %l1 = load <64 x i16>, <64 x i16>* %src, align 2
   %s1 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
   %s2 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
   %s3 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
@@ -341,6 +370,35 @@ entry:
   ret void
 }
 
+define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) {
+; CHECK-LABEL: vld4_v8i16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.i16 q4, q2, q3
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vadd.i16 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x i16>, <32 x i16>* %src, align 1
+  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %s3 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %s4 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %a1 = add <8 x i16> %s1, %s2
+  %a2 = add <8 x i16> %s3, %s4
+  %a3 = add <8 x i16> %a1, %a2
+  store <8 x i16> %a3, <8 x i16> *%dst
+  ret void
+}
+
 ; i8
 
 define void @vld4_v2i8(<8 x i8> *%src, <2 x i8> *%dst) {
@@ -365,7 +423,7 @@ define void @vld4_v2i8(<8 x i8> *%src, <2 x i8> *%dst) {
 ; CHECK-NEXT:    strb r0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x i8>, <8 x i8>* %src, align 4
+  %l1 = load <8 x i8>, <8 x i8>* %src, align 1
   %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
   %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
   %s3 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
@@ -380,7 +438,7 @@ entry:
 define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-LABEL: vld4_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    vmov.u8 r0, q0[10]
 ; CHECK-NEXT:    vmov.u8 r2, q0[2]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
@@ -395,7 +453,7 @@ define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-NEXT:    vstrb.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <16 x i8>, <16 x i8>* %src, align 4
+  %l1 = load <16 x i8>, <16 x i8>* %src, align 1
   %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
   %s3 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
@@ -412,8 +470,8 @@ define void @vld4_v8i8(<32 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
 ; CHECK-NEXT:    vmov.u8 r2, q1[3]
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.16 q2[0], r2
@@ -485,7 +543,7 @@ define void @vld4_v8i8(<32 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <32 x i8>, <32 x i8>* %src, align 4
+  %l1 = load <32 x i8>, <32 x i8>* %src, align 1
   %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
   %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
   %s3 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
@@ -514,7 +572,7 @@ define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <64 x i8>, <64 x i8>* %src, align 4
+  %l1 = load <64 x i8>, <64 x i8>* %src, align 1
   %s1 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
   %s2 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
   %s3 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
@@ -585,7 +643,7 @@ define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
-  %l1 = load <8 x i64>, <8 x i64>* %src, align 4
+  %l1 = load <8 x i64>, <8 x i64>* %src, align 8
   %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
   %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
   %s3 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
@@ -713,7 +771,7 @@ define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
-  %l1 = load <16 x i64>, <16 x i64>* %src, align 4
+  %l1 = load <16 x i64>, <16 x i64>* %src, align 8
   %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %s2 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
   %s3 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
@@ -904,12 +962,41 @@ entry:
   ret void
 }
 
+define void @vld4_v4f32_align1(<16 x float> *%src, <4 x float> *%dst) {
+; CHECK-LABEL: vld4_v4f32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.f32 q4, q2, q3
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <16 x float>, <16 x float>* %src, align 1
+  %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %s3 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %s4 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %a1 = fadd <4 x float> %s1, %s2
+  %a2 = fadd <4 x float> %s3, %s4
+  %a3 = fadd <4 x float> %a1, %a2
+  store <4 x float> %a3, <4 x float> *%dst
+  ret void
+}
+
 ; f16
 
 define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) {
 ; CHECK-LABEL: vld4_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    vmovx.f16 s4, s3
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s1
@@ -937,7 +1024,7 @@ define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    str r0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x half>, <8 x half>* %src, align 4
+  %l1 = load <8 x half>, <8 x half>* %src, align 2
   %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 0, i32 4>
   %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 1, i32 5>
   %s3 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 2, i32 6>
@@ -954,8 +1041,8 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8}
 ; CHECK-NEXT:    vpush {d8}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.u16 q1, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    vmovx.f16 s12, s5
 ; CHECK-NEXT:    vmov r3, s7
@@ -1005,7 +1092,7 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <16 x half>, <16 x half>* %src, align 4
+  %l1 = load <16 x half>, <16 x half>* %src, align 2
   %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
   %s3 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
@@ -1034,7 +1121,7 @@ define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <32 x half>, <32 x half>* %src, align 4
+  %l1 = load <32 x half>, <32 x half>* %src, align 2
   %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
   %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
   %s3 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
@@ -1082,7 +1169,7 @@ define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) {
 ; CHECK-NEXT:    pop {r4, r5}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <64 x half>, <64 x half>* %src, align 4
+  %l1 = load <64 x half>, <64 x half>* %src, align 2
   %s1 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
   %s2 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
   %s3 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
@@ -1094,6 +1181,35 @@ entry:
   ret void
 }
 
+define void @vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) {
+; CHECK-LABEL: vld4_v8f16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.f16 q4, q2, q3
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %l1 = load <32 x half>, <32 x half>* %src, align 1
+  %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %s3 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %s4 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %a1 = fadd <8 x half> %s1, %s2
+  %a2 = fadd <8 x half> %s3, %s4
+  %a3 = fadd <8 x half> %a1, %a2
+  store <8 x half> %a3, <8 x half> *%dst
+  ret void
+}
+
 ; f64
 
 define void @vld4_v2f64(<8 x double> *%src, <2 x double> *%dst) {
@@ -1112,7 +1228,7 @@ define void @vld4_v2f64(<8 x double> *%src, <2 x double> *%dst) {
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <8 x double>, <8 x double>* %src, align 4
+  %l1 = load <8 x double>, <8 x double>* %src, align 8
   %s1 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 0, i32 4>
   %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 1, i32 5>
   %s3 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 2, i32 6>
@@ -1154,7 +1270,7 @@ define void @vld4_v4f64(<16 x double> *%src, <4 x double> *%dst) {
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
-  %l1 = load <16 x double>, <16 x double>* %src, align 4
+  %l1 = load <16 x double>, <16 x double>* %src, align 8
   %s1 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %s2 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
   %s3 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index 903ae4243b54..eb909810f901 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -18,7 +18,7 @@ entry:
   %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
   %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
   %s = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x i32> %s, <4 x i32> *%dst
+  store <4 x i32> %s, <4 x i32> *%dst, align 4
   ret void
 }
 
@@ -36,7 +36,7 @@ entry:
   %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
   %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
   %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-  store <8 x i32> %s, <8 x i32> *%dst
+  store <8 x i32> %s, <8 x i32> *%dst, align 4
   ret void
 }
 
@@ -58,7 +58,7 @@ entry:
   %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
   %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
   %s = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  store <16 x i32> %s, <16 x i32> *%dst
+  store <16 x i32> %s, <16 x i32> *%dst, align 4
   ret void
 }
 
@@ -93,7 +93,25 @@ entry:
   %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
   %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
   %s = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  store <32 x i32> %s, <32 x i32> *%dst
+  store <32 x i32> %s, <32 x i32> *%dst, align 4
+  ret void
+}
+
+define void @vst2_v4i32_align1(<4 x i32> *%src, <8 x i32> *%dst) {
+; CHECK-LABEL: vst2_v4i32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
+; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
+  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
+  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
+  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
+  %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %s, <8 x i32> *%dst, align 1
   ret void
 }
 
@@ -116,7 +134,7 @@ entry:
   %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
   %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
   %s = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x i16> %s, <4 x i16> *%dst
+  store <4 x i16> %s, <4 x i16> *%dst, align 2
   ret void
 }
 
@@ -126,7 +144,7 @@ define void @vst2_v4i16(<4 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
 ; CHECK-NEXT:    vldrh.u32 q1, [r0]
 ; CHECK-NEXT:    vmovnt.i32 q1, q0
-; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vstrh.16 q1, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
@@ -134,7 +152,7 @@ entry:
   %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
   %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
   %s = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-  store <8 x i16> %s, <8 x i16> *%dst
+  store <8 x i16> %s, <8 x i16> *%dst, align 2
   ret void
 }
 
@@ -152,7 +170,7 @@ entry:
   %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
   %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
   %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  store <16 x i16> %s, <16 x i16> *%dst
+  store <16 x i16> %s, <16 x i16> *%dst, align 2
   ret void
 }
 
@@ -174,7 +192,25 @@ entry:
   %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
   %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
   %s = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  store <32 x i16> %s, <32 x i16> *%dst
+  store <32 x i16> %s, <32 x i16> *%dst, align 2
+  ret void
+}
+
+define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) {
+; CHECK-LABEL: vst2_v8i16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
+; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
+  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
+  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
+  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
+  %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i16> %s, <16 x i16> *%dst, align 1
   ret void
 }
 
@@ -197,7 +233,7 @@ entry:
   %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
   %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
   %s = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x i8> %s, <4 x i8> *%dst
+  store <4 x i8> %s, <4 x i8> *%dst, align 1
   ret void
 }
 
@@ -215,7 +251,7 @@ entry:
   %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
   %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
   %s = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-  store <8 x i8> %s, <8 x i8> *%dst
+  store <8 x i8> %s, <8 x i8> *%dst, align 1
   ret void
 }
 
@@ -225,7 +261,7 @@ define void @vst2_v8i8(<8 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
 ; CHECK-NEXT:    vldrb.u16 q1, [r0]
 ; CHECK-NEXT:    vmovnt.i16 q1, q0
-; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vstrb.8 q1, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
@@ -233,7 +269,7 @@ entry:
   %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
   %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
   %s = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  store <16 x i8> %s, <16 x i8> *%dst
+  store <16 x i8> %s, <16 x i8> *%dst, align 1
   ret void
 }
 
@@ -251,7 +287,7 @@ entry:
   %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
   %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
   %s = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  store <32 x i8> %s, <32 x i8> *%dst
+  store <32 x i8> %s, <32 x i8> *%dst, align 1
   ret void
 }
 
@@ -277,7 +313,7 @@ entry:
   %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
   %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
   %s = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x i64> %s, <4 x i64> *%dst
+  store <4 x i64> %s, <4 x i64> *%dst, align 8
   ret void
 }
 
@@ -314,7 +350,7 @@ entry:
   %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
   %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
   %s = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-  store <8 x i64> %s, <8 x i64> *%dst
+  store <8 x i64> %s, <8 x i64> *%dst, align 8
   ret void
 }
 
@@ -335,7 +371,7 @@ entry:
   %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
   %l2 = load <2 x float>, <2 x float>* %s2, align 4
   %s = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x float> %s, <4 x float> *%dst
+  store <4 x float> %s, <4 x float> *%dst, align 4
   ret void
 }
 
@@ -353,7 +389,7 @@ entry:
   %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
   %l2 = load <4 x float>, <4 x float>* %s2, align 4
   %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-  store <8 x float> %s, <8 x float> *%dst
+  store <8 x float> %s, <8 x float> *%dst, align 4
   ret void
 }
 
@@ -375,7 +411,7 @@ entry:
   %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
   %l2 = load <8 x float>, <8 x float>* %s2, align 4
   %s = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  store <16 x float> %s, <16 x float> *%dst
+  store <16 x float> %s, <16 x float> *%dst, align 4
   ret void
 }
 
@@ -410,7 +446,25 @@ entry:
   %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
   %l2 = load <16 x float>, <16 x float>* %s2, align 4
   %s = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  store <32 x float> %s, <32 x float> *%dst
+  store <32 x float> %s, <32 x float> *%dst, align 4
+  ret void
+}
+
+define void @vst2_v4f32_align1(<4 x float> *%src, <8 x float> *%dst) {
+; CHECK-LABEL: vst2_v4f32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
+; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
+  %l1 = load <4 x float>, <4 x float>* %s1, align 4
+  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
+  %l2 = load <4 x float>, <4 x float>* %s2, align 4
+  %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %s, <8 x float> *%dst, align 1
   ret void
 }
 
@@ -432,9 +486,10 @@ define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vmov.16 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r2, s9
 ; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strd r0, r2, [r1]
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    str r2, [r1, #4]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -442,7 +497,7 @@ entry:
   %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
   %l2 = load <2 x half>, <2 x half>* %s2, align 4
   %s = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x half> %s, <4 x half> *%dst
+  store <4 x half> %s, <4 x half> *%dst, align 2
   ret void
 }
 
@@ -475,7 +530,7 @@ define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
 ; CHECK-NEXT:    vmov.16 q1[6], r0
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vstrh.16 q1, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -483,7 +538,7 @@ entry:
   %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
   %l2 = load <4 x half>, <4 x half>* %s2, align 4
   %s = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-  store <8 x half> %s, <8 x half> *%dst
+  store <8 x half> %s, <8 x half> *%dst, align 2
   ret void
 }
 
@@ -501,7 +556,7 @@ entry:
   %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
   %l2 = load <8 x half>, <8 x half>* %s2, align 4
   %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  store <16 x half> %s, <16 x half> *%dst
+  store <16 x half> %s, <16 x half> *%dst, align 2
   ret void
 }
 
@@ -523,7 +578,25 @@ entry:
   %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
   %l2 = load <16 x half>, <16 x half>* %s2, align 4
   %s = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  store <32 x half> %s, <32 x half> *%dst
+  store <32 x half> %s, <32 x half> *%dst, align 2
+  ret void
+}
+
+define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) {
+; CHECK-LABEL: vst2_v8f16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
+; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
+  %l1 = load <8 x half>, <8 x half>* %s1, align 4
+  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
+  %l2 = load <8 x half>, <8 x half>* %s2, align 4
+  %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x half> %s, <16 x half> *%dst, align 1
   ret void
 }
 
@@ -546,7 +619,7 @@ entry:
   %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
   %l2 = load <2 x double>, <2 x double>* %s2, align 4
   %s = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x double> %s, <4 x double> *%dst
+  store <4 x double> %s, <4 x double> *%dst, align 8
   ret void
 }
 
@@ -577,6 +650,6 @@ entry:
   %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
   %l2 = load <4 x double>, <4 x double>* %s2, align 4
   %s = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-  store <8 x double> %s, <8 x double> *%dst
+  store <8 x double> %s, <8 x double> *%dst, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 078bdc762dc0..120c0d6a0e9b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -41,7 +41,7 @@ entry:
   %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %t2 = shufflevector <2 x i32> %l3, <2 x i32> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x i32> %s, <8 x i32> *%dst
+  store <8 x i32> %s, <8 x i32> *%dst, align 4
   ret void
 }
 
@@ -69,7 +69,7 @@ entry:
   %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x i32> %s, <16 x i32> *%dst
+  store <16 x i32> %s, <16 x i32> *%dst, align 4
   ret void
 }
 
@@ -108,7 +108,7 @@ entry:
   %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %t2 = shufflevector <8 x i32> %l3, <8 x i32> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-  store <32 x i32> %s, <32 x i32> *%dst
+  store <32 x i32> %s, <32 x i32> *%dst, align 4
   ret void
 }
 
@@ -196,7 +196,35 @@ entry:
   %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %t2 = shufflevector <16 x i32> %l3, <16 x i32> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-  store <64 x i32> %s, <64 x i32> *%dst
+  store <64 x i32> %s, <64 x i32> *%dst, align 4
+  ret void
+}
+
+define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) {
+; CHECK-LABEL: vst4_v4i32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
+  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
+  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
+  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
+  %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
+  %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
+  %s4 = getelementptr <4 x i32>, <4 x i32>* %src, i32 3
+  %l4 = load <4 x i32>, <4 x i32>* %s3, align 4
+  %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %s, <16 x i32> *%dst, align 1
   ret void
 }
 
@@ -223,7 +251,7 @@ define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-NEXT:    vmov.16 q0[5], lr
 ; CHECK-NEXT:    vmov.16 q0[6], r4
 ; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
@@ -237,7 +265,7 @@ entry:
   %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %t2 = shufflevector <2 x i16> %l3, <2 x i16> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x i16> %s, <8 x i16> *%dst
+  store <8 x i16> %s, <8 x i16> *%dst, align 2
   ret void
 }
 
@@ -269,7 +297,7 @@ define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-NEXT:    vmov.16 q4[1], r0
 ; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vstrh.16 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.16 q4[3], r0
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov.16 q4[4], r0
@@ -278,7 +306,7 @@ define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-NEXT:    vmov r0, s13
 ; CHECK-NEXT:    vmov.16 q4[6], r0
 ; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vstrh.16 q4, [r1]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -293,7 +321,7 @@ entry:
   %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %t2 = shufflevector <4 x i16> %l3, <4 x i16> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x i16> %s, <16 x i16> *%dst
+  store <16 x i16> %s, <16 x i16> *%dst, align 2
   ret void
 }
 
@@ -321,7 +349,7 @@ entry:
   %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-  store <32 x i16> %s, <32 x i16> *%dst
+  store <32 x i16> %s, <32 x i16> *%dst, align 2
   ret void
 }
 
@@ -360,7 +388,35 @@ entry:
   %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %t2 = shufflevector <16 x i16> %l3, <16 x i16> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-  store <64 x i16> %s, <64 x i16> *%dst
+  store <64 x i16> %s, <64 x i16> *%dst, align 2
+  ret void
+}
+
+define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) {
+; CHECK-LABEL: vst4_v8i16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
+  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
+  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
+  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
+  %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
+  %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
+  %s4 = getelementptr <8 x i16>, <8 x i16>* %src, i32 3
+  %l4 = load <8 x i16>, <8 x i16>* %s3, align 4
+  %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i16> %s, <32 x i16> *%dst, align 1
   ret void
 }
 
@@ -401,7 +457,7 @@ entry:
   %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %t2 = shufflevector <2 x i8> %l3, <2 x i8> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x i8> %s, <8 x i8> *%dst
+  store <8 x i8> %s, <8 x i8> *%dst, align 1
   ret void
 }
 
@@ -439,7 +495,7 @@ define void @vst4_v4i8(<4 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.8 q0[14], r0
 ; CHECK-NEXT:    vmov.8 q0[15], r0
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
@@ -453,7 +509,7 @@ entry:
   %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %t2 = shufflevector <4 x i8> %l3, <4 x i8> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x i8> %s, <16 x i8> *%dst
+  store <16 x i8> %s, <16 x i8> *%dst, align 1
   ret void
 }
 
@@ -499,7 +555,7 @@ define void @vst4_v8i8(<8 x i8> *%src, <32 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.8 q4[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[0]
 ; CHECK-NEXT:    vmov.8 q4[2], r0
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.8 q4[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.8 q4[4], r0
@@ -522,7 +578,7 @@ define void @vst4_v8i8(<8 x i8> *%src, <32 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.u16 r0, q3[3]
 ; CHECK-NEXT:    vmov.8 q4[14], r0
 ; CHECK-NEXT:    vmov.8 q4[15], r0
-; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vstrb.8 q4, [r1]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -537,7 +593,7 @@ entry:
   %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %t2 = shufflevector <8 x i8> %l3, <8 x i8> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-  store <32 x i8> %s, <32 x i8> *%dst
+  store <32 x i8> %s, <32 x i8> *%dst, align 1
   ret void
 }
 
@@ -565,7 +621,7 @@ entry:
   %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %t2 = shufflevector <16 x i8> %l3, <16 x i8> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-  store <64 x i8> %s, <64 x i8> *%dst
+  store <64 x i8> %s, <64 x i8> *%dst, align 1
   ret void
 }
 
@@ -606,7 +662,7 @@ entry:
   %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %t2 = shufflevector <2 x i64> %l3, <2 x i64> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x i64> %s, <8 x i64> *%dst
+  store <8 x i64> %s, <8 x i64> *%dst, align 8
   ret void
 }
 
@@ -667,7 +723,7 @@ entry:
   %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %t2 = shufflevector <4 x i64> %l3, <4 x i64> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x i64> %s, <16 x i64> *%dst
+  store <16 x i64> %s, <16 x i64> *%dst, align 8
   ret void
 }
 
@@ -699,7 +755,7 @@ entry:
   %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %t2 = shufflevector <2 x float> %l3, <2 x float> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <4 x float> %t1, <4 x float> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x float> %s, <8 x float> *%dst
+  store <8 x float> %s, <8 x float> *%dst, align 4
   ret void
 }
 
@@ -727,7 +783,7 @@ entry:
   %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x float> %s, <16 x float> *%dst
+  store <16 x float> %s, <16 x float> *%dst, align 4
   ret void
 }
 
@@ -766,7 +822,7 @@ entry:
   %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %t2 = shufflevector <8 x float> %l3, <8 x float> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s = shufflevector <16 x float> %t1, <16 x float> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-  store <32 x float> %s, <32 x float> *%dst
+  store <32 x float> %s, <32 x float> *%dst, align 4
   ret void
 }
 
@@ -854,7 +910,35 @@ entry:
   %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %t2 = shufflevector <16 x float> %l3, <16 x float> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %s = shufflevector <32 x float> %t1, <32 x float> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-  store <64 x float> %s, <64 x float> *%dst
+  store <64 x float> %s, <64 x float> *%dst, align 4
+  ret void
+}
+
+define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) {
+; CHECK-LABEL: vst4_v4f32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
+  %l1 = load <4 x float>, <4 x float>* %s1, align 4
+  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
+  %l2 = load <4 x float>, <4 x float>* %s2, align 4
+  %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
+  %l3 = load <4 x float>, <4 x float>* %s3, align 4
+  %s4 = getelementptr <4 x float>, <4 x float>* %src, i32 3
+  %l4 = load <4 x float>, <4 x float>* %s3, align 4
+  %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x float> %s, <16 x float> *%dst, align 1
   ret void
 }
 
@@ -886,7 +970,7 @@ define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
 ; CHECK-NEXT:    vmov.16 q0[6], r0
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -900,7 +984,7 @@ entry:
   %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %t2 = shufflevector <2 x half> %l3, <2 x half> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <4 x half> %t1, <4 x half> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x half> %s, <8 x half> *%dst
+  store <8 x half> %s, <8 x half> *%dst, align 2
   ret void
 }
 
@@ -937,7 +1021,7 @@ define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
 ; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmov.16 q2[7], r0
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vstrh.16 q2, [r1, #16]
 ; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov.16 q2[0], r2
 ; CHECK-NEXT:    vmovx.f16 s12, s4
@@ -957,7 +1041,7 @@ define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
 ; CHECK-NEXT:    vmov.16 q2[6], r0
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vstrh.16 q2, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -971,7 +1055,7 @@ entry:
   %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %t2 = shufflevector <4 x half> %l3, <4 x half> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s = shufflevector <8 x half> %t1, <8 x half> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x half> %s, <16 x half> *%dst
+  store <16 x half> %s, <16 x half> *%dst, align 2
   ret void
 }
 
@@ -999,7 +1083,7 @@ entry:
   %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-  store <32 x half> %s, <32 x half> *%dst
+  store <32 x half> %s, <32 x half> *%dst, align 2
   ret void
 }
 
@@ -1038,7 +1122,35 @@ entry:
   %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %t2 = shufflevector <16 x half> %l3, <16 x half> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %s = shufflevector <32 x half> %t1, <32 x half> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-  store <64 x half> %s, <64 x half> *%dst
+  store <64 x half> %s, <64 x half> *%dst, align 2
+  ret void
+}
+
+define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
+; CHECK-LABEL: vst4_v8f16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vst40.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst41.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst42.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    vst43.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
+  %l1 = load <8 x half>, <8 x half>* %s1, align 4
+  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
+  %l2 = load <8 x half>, <8 x half>* %s2, align 4
+  %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
+  %l3 = load <8 x half>, <8 x half>* %s3, align 4
+  %s4 = getelementptr <8 x half>, <8 x half>* %src, i32 3
+  %l4 = load <8 x half>, <8 x half>* %s3, align 4
+  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x half> %s, <32 x half> *%dst, align 1
   ret void
 }
 
@@ -1073,7 +1185,7 @@ entry:
   %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %t2 = shufflevector <2 x double> %l3, <2 x double> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <4 x double> %t1, <4 x double> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x double> %s, <8 x double> *%dst
+  store <8 x double> %s, <8 x double> *%dst, align 8
   ret void
 }
 
@@ -1122,6 +1234,6 @@ entry:
   %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %t2 = shufflevector <4 x double> %l3, <4 x double> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s = shufflevector <8 x double> %t1, <8 x double> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x double> %s, <16 x double> *%dst
+  store <16 x double> %s, <16 x double> *%dst, align 8
   ret void
 }