[llvm] r368255 - [ARM] Rejig MVE load store tests. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 7 22:58:48 PDT 2019


Author: dmgreen
Date: Wed Aug  7 22:58:48 2019
New Revision: 368255

URL: http://llvm.org/viewvc/llvm-project?rev=368255&view=rev
Log:
[ARM] Rejig MVE load store tests. NFC

This adjusts the load/store tests for better testing of alignments. It also
adds some extra alignment 1 tests, useful for future commits.

Modified:
    llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-ldst-regimm.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll?rev=368255&r1=368254&r2=368255&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-offset.ll Wed Aug  7 22:58:48 2019
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
 
-define i8* @post_ldrwu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_4:
+define i8* @ldrwu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -10,14 +10,14 @@ define i8* @post_ldrwu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrwu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_3:
+define i8* @ldrwu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
@@ -26,14 +26,14 @@ define i8* @post_ldrwu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrwu32_m4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m4:
+define i8* @ldrwu32_m4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #-4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -41,14 +41,14 @@ define i8* @post_ldrwu32_m4(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -4
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrwu32_508(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_508:
+define i8* @ldrwu32_508(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #508
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
@@ -57,14 +57,14 @@ define i8* @post_ldrwu32_508(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 508
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrwu32_512(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_512:
+define i8* @ldrwu32_512(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #512
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
@@ -73,14 +73,14 @@ define i8* @post_ldrwu32_512(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 512
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrwu32_m508(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m508:
+define i8* @ldrwu32_m508(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    sub.w r2, r0, #508
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
@@ -89,14 +89,14 @@ define i8* @post_ldrwu32_m508(i8* %x, i8
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -508
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrwu32_m512(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m512:
+define i8* @ldrwu32_m512(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    sub.w r2, r0, #512
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
@@ -105,15 +105,15 @@ define i8* @post_ldrwu32_m512(i8* %x, i8
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -512
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %x
 }
 
 
-define i8* @post_ldrhu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_4:
+define i8* @ldrhu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r0, #4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -121,15 +121,15 @@ define i8* @post_ldrhu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_3:
+define i8* @ldrhu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
 ; CHECK-NEXT:    vldrh.u32 q0, [r2]
@@ -138,15 +138,15 @@ define i8* @post_ldrhu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhu32_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_2:
+define i8* @ldrhu32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #2
 ; CHECK-NEXT:    vldrh.u32 q0, [r2]
@@ -155,15 +155,15 @@ define i8* @post_ldrhu32_2(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhu32_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_254:
+define i8* @ldrhu32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #254
 ; CHECK-NEXT:    vldrh.u32 q0, [r2]
@@ -172,15 +172,15 @@ define i8* @post_ldrhu32_254(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhu32_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_256:
+define i8* @ldrhu32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #256
 ; CHECK-NEXT:    vldrh.u32 q0, [r2]
@@ -189,16 +189,16 @@ define i8* @post_ldrhu32_256(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
 
-define i8* @post_ldrhs32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_4:
+define i8* @ldrhs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -206,15 +206,15 @@ define i8* @post_ldrhs32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhs32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_3:
+define i8* @ldrhs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
 ; CHECK-NEXT:    vldrh.s32 q0, [r2]
@@ -223,15 +223,15 @@ define i8* @post_ldrhs32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhs32_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_2:
+define i8* @ldrhs32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #2
 ; CHECK-NEXT:    vldrh.s32 q0, [r2]
@@ -240,15 +240,15 @@ define i8* @post_ldrhs32_2(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhs32_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_254:
+define i8* @ldrhs32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #254
 ; CHECK-NEXT:    vldrh.s32 q0, [r2]
@@ -257,15 +257,15 @@ define i8* @post_ldrhs32_254(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrhs32_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_256:
+define i8* @ldrhs32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #256
 ; CHECK-NEXT:    vldrh.s32 q0, [r2]
@@ -274,96 +274,95 @@ define i8* @post_ldrhs32_256(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
 
-define i8* @post_ldrhu16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_4:
+define i8* @ldrhu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #4]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrhu16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_3:
+define i8* @ldrhu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrhu16_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_2:
+define i8* @ldrhu16_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    adds r2, r0, #2
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #2]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrhu16_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_254:
+define i8* @ldrhu16_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #254
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrhu16_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_256:
+define i8* @ldrhu16_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #256
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %x
 }
 
 
-define i8* @post_ldrbu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_4:
+define i8* @ldrbu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -371,15 +370,15 @@ define i8* @post_ldrbu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrbu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_3:
+define i8* @ldrbu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
 ; CHECK-NEXT:    vldrb.u32 q0, [r2]
@@ -388,15 +387,15 @@ define i8* @post_ldrbu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrbu32_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_127:
+define i8* @ldrbu32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #127
 ; CHECK-NEXT:    vldrb.u32 q0, [r2]
@@ -405,15 +404,15 @@ define i8* @post_ldrbu32_127(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrbu32_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_128:
+define i8* @ldrbu32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #128
 ; CHECK-NEXT:    vldrb.u32 q0, [r2]
@@ -422,16 +421,16 @@ define i8* @post_ldrbu32_128(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
 
-define i8* @post_ldrbs32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_4:
+define i8* @ldrbs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s32 q0, [r0, #4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -439,15 +438,15 @@ define i8* @post_ldrbs32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrbs32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_3:
+define i8* @ldrbs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
 ; CHECK-NEXT:    vldrb.s32 q0, [r2]
@@ -456,15 +455,15 @@ define i8* @post_ldrbs32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrbs32_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_127:
+define i8* @ldrbs32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #127
 ; CHECK-NEXT:    vldrb.s32 q0, [r2]
@@ -473,15 +472,15 @@ define i8* @post_ldrbs32_127(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrbs32_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_128:
+define i8* @ldrbs32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #128
 ; CHECK-NEXT:    vldrb.s32 q0, [r2]
@@ -490,214 +489,214 @@ define i8* @post_ldrbs32_128(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %x
 }
 
 
-define i8* @post_ldrbu16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_4:
+define i8* @ldrbu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r0, #4]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrbu16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_3:
+define i8* @ldrbu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
 ; CHECK-NEXT:    vldrb.u16 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrbu16_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_127:
+define i8* @ldrbu16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #127
 ; CHECK-NEXT:    vldrb.u16 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrbu16_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_128:
+define i8* @ldrbu16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #128
 ; CHECK-NEXT:    vldrb.u16 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
 
-define i8* @post_ldrbs16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_4:
+define i8* @ldrbs16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q0, [r0, #4]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrbs16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_3:
+define i8* @ldrbs16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r2, r0, #3
 ; CHECK-NEXT:    vldrb.s16 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrbs16_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_127:
+define i8* @ldrbs16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #127
 ; CHECK-NEXT:    vldrb.s16 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
-define i8* @post_ldrbs16_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_128:
+define i8* @ldrbs16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #128
 ; CHECK-NEXT:    vldrb.s16 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %x
 }
 
 
-define i8* @post_ldrbu8_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_4:
+define i8* @ldrbu8_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #4]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %x
 }
 
-define i8* @post_ldrbu8_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_3:
+define i8* @ldrbu8_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    adds r2, r0, #3
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %x
 }
 
-define i8* @post_ldrbu8_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_127:
+define i8* @ldrbu8_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r2, r0, #127
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %x
 }
 
-define i8* @post_ldrbu8_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_128:
+define i8* @ldrbu8_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_128:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add.w r2, r0, #128
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %x
 }
 
-define i8* @post_ldrwf32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwf32_4:
+define i8* @ldrwf32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -705,24 +704,101 @@ define i8* @post_ldrwf32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x float>*
-  %1 = load <4 x float>, <4 x float>* %0, align 8
+  %1 = load <4 x float>, <4 x float>* %0, align 4
   %2 = bitcast i8* %y to <4 x float>*
-  store <4 x float> %1, <4 x float>* %2, align 8
+  store <4 x float> %1, <4 x float>* %2, align 4
   ret i8* %x
 }
 
-define i8* @post_ldrwf16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwf16_4:
+define i8* @ldrwf16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #4]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x half>*
-  %1 = load <8 x half>, <8 x half>* %0, align 8
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrwi32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 1
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhi16_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhi16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 1
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrhi32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    adds r2, r0, #3
+; CHECK-NEXT:    vldrh.s32 q0, [r2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i16>*
+  %1 = load <4 x i16>, <4 x i16>* %0, align 1
+  %2 = bitcast i8* %y to <4 x i32>*
+  %3 = sext <4 x i16> %1 to <4 x i32>
+  store <4 x i32> %3, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwf32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 1
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwf16_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 1
   %2 = bitcast i8* %y to <8 x half>*
-  store <8 x half> %1, <8 x half>* %2, align 8
+  store <8 x half> %1, <8 x half>* %2, align 2
   ret i8* %x
 }
 
@@ -730,8 +806,8 @@ entry:
 
 
 
-define i8* @post_strw32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_4:
+define i8* @strw32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
@@ -739,14 +815,14 @@ define i8* @post_strw32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %y
 }
 
-define i8* @post_strw32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_3:
+define i8* @strw32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    adds r1, r0, #3
@@ -755,14 +831,14 @@ define i8* @post_strw32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %y
 }
 
-define i8* @post_strw32_m4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m4:
+define i8* @strw32_m4(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0, #-4]
@@ -770,14 +846,14 @@ define i8* @post_strw32_m4(i8* %y, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %y
 }
 
-define i8* @post_strw32_508(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_508:
+define i8* @strw32_508(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #508
@@ -786,14 +862,14 @@ define i8* @post_strw32_508(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %y
 }
 
-define i8* @post_strw32_512(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_512:
+define i8* @strw32_512(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #512
@@ -802,14 +878,14 @@ define i8* @post_strw32_512(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %y
 }
 
-define i8* @post_strw32_m508(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m508:
+define i8* @strw32_m508(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    sub.w r1, r0, #508
@@ -818,14 +894,14 @@ define i8* @post_strw32_m508(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %y
 }
 
-define i8* @post_strw32_m512(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m512:
+define i8* @strw32_m512(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    sub.w r1, r0, #512
@@ -834,15 +910,15 @@ define i8* @post_strw32_m512(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %y
 }
 
 
-define i8* @post_strh32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_4:
+define i8* @strh32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    vstrh.32 q0, [r0, #4]
@@ -850,14 +926,14 @@ define i8* @post_strh32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_3:
+define i8* @strh32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    adds r1, r0, #3
@@ -866,14 +942,14 @@ define i8* @post_strh32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh32_2(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_2:
+define i8* @strh32_2(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    adds r1, r0, #2
@@ -882,14 +958,14 @@ define i8* @post_strh32_2(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 2
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh32_254(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_254:
+define i8* @strh32_254(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #254
@@ -898,14 +974,14 @@ define i8* @post_strh32_254(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 254
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh32_256(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_256:
+define i8* @strh32_256(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #256
@@ -914,95 +990,94 @@ define i8* @post_strh32_256(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 256
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %y
 }
 
 
-define i8* @post_strh16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_4:
+define i8* @strh16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, #4]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh16_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_3:
+define i8* @strh16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
 ; CHECK-NEXT:    adds r1, r0, #3
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh16_2(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_2:
+define i8* @strh16_2(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adds r1, r0, #2
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, #2]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 2
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh16_254(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_254:
+define i8* @strh16_254(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_254:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #254
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 254
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %y
 }
 
-define i8* @post_strh16_256(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_256:
+define i8* @strh16_256(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_256:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #256
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 256
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %y
 }
 
 
-define i8* @post_strb32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_4:
+define i8* @strb32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vstrb.32 q0, [r0, #4]
@@ -1010,14 +1085,14 @@ define i8* @post_strb32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_3:
+define i8* @strb32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    adds r1, r0, #3
@@ -1026,14 +1101,14 @@ define i8* @post_strb32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb32_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_127:
+define i8* @strb32_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #127
@@ -1042,14 +1117,14 @@ define i8* @post_strb32_127(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb32_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_128:
+define i8* @strb32_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #128
@@ -1058,15 +1133,15 @@ define i8* @post_strb32_128(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %y
 }
 
 
-define i8* @post_strb16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_4:
+define i8* @strb16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    vstrb.16 q0, [r0, #4]
@@ -1074,14 +1149,14 @@ define i8* @post_strb16_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb16_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_3:
+define i8* @strb16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    adds r1, r0, #3
@@ -1090,14 +1165,14 @@ define i8* @post_strb16_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb16_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_127:
+define i8* @strb16_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #127
@@ -1106,14 +1181,14 @@ define i8* @post_strb16_127(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb16_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_128:
+define i8* @strb16_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #128
@@ -1122,77 +1197,77 @@ define i8* @post_strb16_128(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %y
 }
 
 
-define i8* @post_strb8_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_4:
+define i8* @strb8_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #4]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb8_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_3:
+define i8* @strb8_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adds r1, r0, #3
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb8_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_127:
+define i8* @strb8_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_127:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
 ; CHECK-NEXT:    add.w r1, r0, #127
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strb8_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_128:
+define i8* @strb8_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_128:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #128]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    add.w r1, r0, #128
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %y
 }
 
-define i8* @post_strf32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strf32_4:
+define i8* @strf32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strf32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
@@ -1200,23 +1275,100 @@ define i8* @post_strf32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x float>*
-  %1 = load <4 x float>, <4 x float>* %0, align 8
+  %1 = load <4 x float>, <4 x float>* %0, align 4
   %2 = bitcast i8* %z to <4 x float>*
-  store <4 x float> %1, <4 x float>* %2, align 8
+  store <4 x float> %1, <4 x float>* %2, align 4
   ret i8* %y
 }
 
-define i8* @post_strf16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strf16_4:
+define i8* @strf16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strf16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, #4]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x half>*
-  %1 = load <8 x half>, <8 x half>* %0, align 8
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %y
+}
+
+define i8* @strwi32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strwi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %z to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 1
+  ret i8* %y
+}
+
+define i8* @strhi16_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strhi16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  %2 = bitcast i8* %z to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 1
+  ret i8* %y
+}
+
+define i8* @strhi32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strhi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    vstrh.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %z to <4 x i16>*
+  %3 = trunc <4 x i32> %1 to <4 x i16>
+  store <4 x i16> %3, <4 x i16>* %2, align 1
+  ret i8* %y
+}
+
+define i8* @strf32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strf32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 1
+  ret i8* %y
+}
+
+define i8* @strf16_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strf16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 2
   %2 = bitcast i8* %z to <8 x half>*
-  store <8 x half> %1, <8 x half>* %2, align 8
+  store <8 x half> %1, <8 x half>* %2, align 1
   ret i8* %y
 }

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll?rev=368255&r1=368254&r2=368255&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-postinc.ll Wed Aug  7 22:58:48 2019
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
 
-define i8* @post_ldrwu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_4:
+define i8* @ldrwu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
@@ -11,14 +11,14 @@ define i8* @post_ldrwu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_3:
+define i8* @ldrwu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
@@ -27,14 +27,14 @@ define i8* @post_ldrwu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_m4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m4:
+define i8* @ldrwu32_m4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    subs r0, #4
@@ -43,14 +43,14 @@ define i8* @post_ldrwu32_m4(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_508(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_508:
+define i8* @ldrwu32_508(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    add.w r0, r0, #508
@@ -59,14 +59,14 @@ define i8* @post_ldrwu32_508(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_512(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_512:
+define i8* @ldrwu32_512(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    add.w r0, r0, #512
@@ -75,14 +75,14 @@ define i8* @post_ldrwu32_512(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_m508(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m508:
+define i8* @ldrwu32_m508(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    sub.w r0, r0, #508
@@ -91,14 +91,14 @@ define i8* @post_ldrwu32_m508(i8* %x, i8
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_m512(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m512:
+define i8* @ldrwu32_m512(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    sub.w r0, r0, #512
@@ -107,15 +107,15 @@ define i8* @post_ldrwu32_m512(i8* %x, i8
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrhu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_4:
+define i8* @ldrhu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
@@ -124,15 +124,15 @@ define i8* @post_ldrhu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_3:
+define i8* @ldrhu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
@@ -141,15 +141,15 @@ define i8* @post_ldrhu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_2:
+define i8* @ldrhu32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #2
@@ -158,15 +158,15 @@ define i8* @post_ldrhu32_2(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_254:
+define i8* @ldrhu32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #254
@@ -175,15 +175,15 @@ define i8* @post_ldrhu32_254(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_256:
+define i8* @ldrhu32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    add.w r0, r0, #256
@@ -192,16 +192,16 @@ define i8* @post_ldrhu32_256(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrhs32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_4:
+define i8* @ldrhs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
@@ -210,15 +210,15 @@ define i8* @post_ldrhs32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_3:
+define i8* @ldrhs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
@@ -227,15 +227,15 @@ define i8* @post_ldrhs32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_2:
+define i8* @ldrhs32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #2
@@ -244,15 +244,15 @@ define i8* @post_ldrhs32_2(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_254:
+define i8* @ldrhs32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #254
@@ -261,15 +261,15 @@ define i8* @post_ldrhs32_254(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_256:
+define i8* @ldrhs32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    add.w r0, r0, #256
@@ -278,97 +278,97 @@ define i8* @post_ldrhs32_256(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrhu16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_4:
+define i8* @ldrhu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_3:
+define i8* @ldrhu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_2:
+define i8* @ldrhu16_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #2
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_254:
+define i8* @ldrhu16_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_254:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #254
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_256:
+define i8* @ldrhu16_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_256:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    add.w r0, r0, #256
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
 
-define i8* @post_ldrbu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_4:
+define i8* @ldrbu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
@@ -377,15 +377,15 @@ define i8* @post_ldrbu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_3:
+define i8* @ldrbu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
@@ -394,15 +394,15 @@ define i8* @post_ldrbu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbu32_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_127:
+define i8* @ldrbu32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #127
@@ -411,15 +411,15 @@ define i8* @post_ldrbu32_127(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbu32_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_128:
+define i8* @ldrbu32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #128
@@ -428,16 +428,16 @@ define i8* @post_ldrbu32_128(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrbs32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_4:
+define i8* @ldrbs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
@@ -446,15 +446,15 @@ define i8* @post_ldrbs32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbs32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_3:
+define i8* @ldrbs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
@@ -463,15 +463,15 @@ define i8* @post_ldrbs32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbs32_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_127:
+define i8* @ldrbs32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #127
@@ -480,15 +480,15 @@ define i8* @post_ldrbs32_127(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbs32_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_128:
+define i8* @ldrbs32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #128
@@ -497,218 +497,218 @@ define i8* @post_ldrbs32_128(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrbu16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_4:
+define i8* @ldrbu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbu16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_3:
+define i8* @ldrbu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbu16_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_127:
+define i8* @ldrbu16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #127
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbu16_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_128:
+define i8* @ldrbu16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
 
-define i8* @post_ldrbs16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_4:
+define i8* @ldrbs16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbs16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_3:
+define i8* @ldrbs16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbs16_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_127:
+define i8* @ldrbs16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #127
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbs16_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_128:
+define i8* @ldrbs16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
 
-define i8* @post_ldrbu8_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_4:
+define i8* @ldrbu8_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrbu8_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_3:
+define i8* @ldrbu8_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrbu8_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_127:
+define i8* @ldrbu8_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_127:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #127
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrbu8_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_128:
+define i8* @ldrbu8_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_128:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrwf32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwf32_4:
+define i8* @ldrwf32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
@@ -717,25 +717,106 @@ define i8* @post_ldrwf32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <4 x float>*
-  %1 = load <4 x float>, <4 x float>* %0, align 8
+  %1 = load <4 x float>, <4 x float>* %0, align 4
   %2 = bitcast i8* %y to <4 x float>*
-  store <4 x float> %1, <4 x float>* %2, align 8
+  store <4 x float> %1, <4 x float>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwf16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwf16_4:
+define i8* @ldrwf16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <8 x half>*
-  %1 = load <8 x half>, <8 x half>* %0, align 8
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrwi32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 1
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhi16_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhi16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 1
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhi32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x i16>*
+  %1 = load <4 x i16>, <4 x i16>* %0, align 1
+  %2 = bitcast i8* %y to <4 x i32>*
+  %3 = sext <4 x i16> %1 to <4 x i32>
+  store <4 x i32> %3, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrf32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrf32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 1
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrf16_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrf16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 1
   %2 = bitcast i8* %y to <8 x half>*
-  store <8 x half> %1, <8 x half>* %2, align 8
+  store <8 x half> %1, <8 x half>* %2, align 2
   ret i8* %z
 }
 
@@ -743,8 +824,8 @@ entry:
 
 
 
-define i8* @post_strw32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_4:
+define i8* @strw32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -753,14 +834,14 @@ define i8* @post_strw32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_3:
+define i8* @strw32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -769,14 +850,14 @@ define i8* @post_strw32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_m4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m4:
+define i8* @strw32_m4(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -785,14 +866,14 @@ define i8* @post_strw32_m4(i8* %y, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_508(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_508:
+define i8* @strw32_508(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -801,14 +882,14 @@ define i8* @post_strw32_508(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_512(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_512:
+define i8* @strw32_512(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -817,14 +898,14 @@ define i8* @post_strw32_512(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_m508(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m508:
+define i8* @strw32_m508(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -833,14 +914,14 @@ define i8* @post_strw32_m508(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_m512(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m512:
+define i8* @strw32_m512(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -849,15 +930,15 @@ define i8* @post_strw32_m512(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
 
-define i8* @post_strh32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_4:
+define i8* @strh32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    vstrh.32 q0, [r0]
@@ -866,14 +947,14 @@ define i8* @post_strh32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %y to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_3:
+define i8* @strh32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    vstrh.32 q0, [r0]
@@ -882,14 +963,14 @@ define i8* @post_strh32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %y to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_2(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_2:
+define i8* @strh32_2(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    vstrh.32 q0, [r0]
@@ -898,14 +979,14 @@ define i8* @post_strh32_2(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 2
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %y to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_254(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_254:
+define i8* @strh32_254(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    vstrh.32 q0, [r0]
@@ -914,14 +995,14 @@ define i8* @post_strh32_254(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 254
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %y to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_256(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_256:
+define i8* @strh32_256(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    vstrh.32 q0, [r0]
@@ -930,96 +1011,96 @@ define i8* @post_strh32_256(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 256
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %y to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
 
-define i8* @post_strh16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_4:
+define i8* @strh16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_3:
+define i8* @strh16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_2(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_2:
+define i8* @strh16_2(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #2
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 2
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_254(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_254:
+define i8* @strh16_254(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_254:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #254
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 254
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_256(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_256:
+define i8* @strh16_256(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_256:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    add.w r0, r0, #256
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 256
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
 
-define i8* @post_strb32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_4:
+define i8* @strb32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vstrb.32 q0, [r0]
@@ -1028,14 +1109,14 @@ define i8* @post_strb32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %y to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_3:
+define i8* @strb32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vstrb.32 q0, [r0]
@@ -1044,14 +1125,14 @@ define i8* @post_strb32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %y to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb32_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_127:
+define i8* @strb32_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vstrb.32 q0, [r0]
@@ -1060,14 +1141,14 @@ define i8* @post_strb32_127(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %y to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb32_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_128:
+define i8* @strb32_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vstrb.32 q0, [r0]
@@ -1076,15 +1157,15 @@ define i8* @post_strb32_128(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %y to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
 
-define i8* @post_strb16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_4:
+define i8* @strb16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    vstrb.16 q0, [r0]
@@ -1093,14 +1174,14 @@ define i8* @post_strb16_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %y to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb16_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_3:
+define i8* @strb16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    vstrb.16 q0, [r0]
@@ -1109,14 +1190,14 @@ define i8* @post_strb16_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %y to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb16_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_127:
+define i8* @strb16_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    vstrb.16 q0, [r0]
@@ -1125,14 +1206,14 @@ define i8* @post_strb16_127(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %y to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb16_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_128:
+define i8* @strb16_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    vstrb.16 q0, [r0]
@@ -1141,79 +1222,79 @@ define i8* @post_strb16_128(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %y to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
 
-define i8* @post_strb8_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_4:
+define i8* @strb8_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb8_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_3:
+define i8* @strb8_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb8_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_127:
+define i8* @strb8_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_127:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #127
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb8_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_128:
+define i8* @strb8_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_128:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strf32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strf32_4:
+define i8* @strf32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strf32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
@@ -1222,24 +1303,105 @@ define i8* @post_strf32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x float>*
-  %1 = load <4 x float>, <4 x float>* %0, align 8
+  %1 = load <4 x float>, <4 x float>* %0, align 4
   %2 = bitcast i8* %y to <4 x float>*
-  store <4 x float> %1, <4 x float>* %2, align 8
+  store <4 x float> %1, <4 x float>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strf16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strf16_4:
+define i8* @strf16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strf16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    adds r0, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x half>*
-  %1 = load <8 x half>, <8 x half>* %0, align 8
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @strwi32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strwi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strhi16_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strhi16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strhi32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strhi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrh.32 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i16>*
+  %3 = trunc <4 x i32> %1 to <4 x i16>
+  store <4 x i16> %3, <4 x i16>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strf32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strf32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strf16_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strf16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 2
   %2 = bitcast i8* %y to <8 x half>*
-  store <8 x half> %1, <8 x half>* %2, align 8
+  store <8 x half> %1, <8 x half>* %2, align 1
   ret i8* %z
 }

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll?rev=368255&r1=368254&r2=368255&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-preinc.ll Wed Aug  7 22:58:48 2019
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
 
-define i8* @post_ldrwu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_4:
+define i8* @ldrwu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
@@ -11,14 +11,14 @@ define i8* @post_ldrwu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_3:
+define i8* @ldrwu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
@@ -27,14 +27,14 @@ define i8* @post_ldrwu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_m4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m4:
+define i8* @ldrwu32_m4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #-4]
 ; CHECK-NEXT:    subs r0, #4
@@ -43,14 +43,14 @@ define i8* @post_ldrwu32_m4(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -4
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_508(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_508:
+define i8* @ldrwu32_508(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #508
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
@@ -59,14 +59,14 @@ define i8* @post_ldrwu32_508(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 508
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_512(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_512:
+define i8* @ldrwu32_512(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #512
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
@@ -75,14 +75,14 @@ define i8* @post_ldrwu32_512(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 512
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_m508(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m508:
+define i8* @ldrwu32_m508(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    sub.w r0, r0, #508
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
@@ -91,14 +91,14 @@ define i8* @post_ldrwu32_m508(i8* %x, i8
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -508
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwu32_m512(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwu32_m512:
+define i8* @ldrwu32_m512(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwu32_m512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    sub.w r0, r0, #512
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
@@ -107,15 +107,15 @@ define i8* @post_ldrwu32_m512(i8* %x, i8
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 -512
   %0 = bitcast i8* %z to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrhu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_4:
+define i8* @ldrhu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
@@ -124,15 +124,15 @@ define i8* @post_ldrhu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_3:
+define i8* @ldrhu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
@@ -141,15 +141,15 @@ define i8* @post_ldrhu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_2:
+define i8* @ldrhu32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #2
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
@@ -158,15 +158,15 @@ define i8* @post_ldrhu32_2(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_254:
+define i8* @ldrhu32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #254
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
@@ -175,15 +175,15 @@ define i8* @post_ldrhu32_254(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhu32_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu32_256:
+define i8* @ldrhu32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #256
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
@@ -192,16 +192,16 @@ define i8* @post_ldrhu32_256(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrhs32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_4:
+define i8* @ldrhs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
@@ -210,15 +210,15 @@ define i8* @post_ldrhs32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_3:
+define i8* @ldrhs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
@@ -227,15 +227,15 @@ define i8* @post_ldrhs32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_2:
+define i8* @ldrhs32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #2
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
@@ -244,15 +244,15 @@ define i8* @post_ldrhs32_2(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_254:
+define i8* @ldrhs32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #254
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
@@ -261,15 +261,15 @@ define i8* @post_ldrhs32_254(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrhs32_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhs32_256:
+define i8* @ldrhs32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhs32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #256
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
@@ -278,97 +278,97 @@ define i8* @post_ldrhs32_256(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %z to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = sext <4 x i16> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrhu16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_4:
+define i8* @ldrhu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_3:
+define i8* @ldrhu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_2(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_2:
+define i8* @ldrhu16_2(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_2:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #2]
 ; CHECK-NEXT:    adds r0, #2
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 2
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_254(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_254:
+define i8* @ldrhu16_254(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #254
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 254
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrhu16_256(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrhu16_256:
+define i8* @ldrhu16_256(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhu16_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #256
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 256
   %0 = bitcast i8* %z to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
 
-define i8* @post_ldrbu32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_4:
+define i8* @ldrbu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
@@ -377,15 +377,15 @@ define i8* @post_ldrbu32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbu32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_3:
+define i8* @ldrbu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrb.u32 q0, [r0]
@@ -394,15 +394,15 @@ define i8* @post_ldrbu32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbu32_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_127:
+define i8* @ldrbu32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
 ; CHECK-NEXT:    vldrb.u32 q0, [r0]
@@ -411,15 +411,15 @@ define i8* @post_ldrbu32_127(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbu32_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu32_128:
+define i8* @ldrbu32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vldrb.u32 q0, [r0]
@@ -428,16 +428,16 @@ define i8* @post_ldrbu32_128(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrbs32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_4:
+define i8* @ldrbs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s32 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
@@ -446,15 +446,15 @@ define i8* @post_ldrbs32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbs32_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_3:
+define i8* @ldrbs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrb.s32 q0, [r0]
@@ -463,15 +463,15 @@ define i8* @post_ldrbs32_3(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbs32_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_127:
+define i8* @ldrbs32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
 ; CHECK-NEXT:    vldrb.s32 q0, [r0]
@@ -480,15 +480,15 @@ define i8* @post_ldrbs32_127(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrbs32_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs32_128:
+define i8* @ldrbs32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vldrb.s32 q0, [r0]
@@ -497,218 +497,218 @@ define i8* @post_ldrbs32_128(i8* %x, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = sext <4 x i8> %1 to <4 x i32>
   %3 = bitcast i8* %y to <4 x i32>*
-  store <4 x i32> %2, <4 x i32>* %3, align 8
+  store <4 x i32> %2, <4 x i32>* %3, align 4
   ret i8* %z
 }
 
 
-define i8* @post_ldrbu16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_4:
+define i8* @ldrbu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbu16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_3:
+define i8* @ldrbu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrb.u16 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbu16_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_127:
+define i8* @ldrbu16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
 ; CHECK-NEXT:    vldrb.u16 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbu16_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu16_128:
+define i8* @ldrbu16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vldrb.u16 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = zext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
 
-define i8* @post_ldrbs16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_4:
+define i8* @ldrbs16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbs16_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_3:
+define i8* @ldrbs16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrb.s16 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbs16_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_127:
+define i8* @ldrbs16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
 ; CHECK-NEXT:    vldrb.s16 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
-define i8* @post_ldrbs16_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbs16_128:
+define i8* @ldrbs16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbs16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vldrb.s16 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = sext <8 x i8> %1 to <8 x i16>
   %3 = bitcast i8* %y to <8 x i16>*
-  store <8 x i16> %2, <8 x i16>* %3, align 8
+  store <8 x i16> %2, <8 x i16>* %3, align 2
   ret i8* %z
 }
 
 
-define i8* @post_ldrbu8_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_4:
+define i8* @ldrbu8_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrbu8_3(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_3:
+define i8* @ldrbu8_3(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_3:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 3
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrbu8_127(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_127:
+define i8* @ldrbu8_127(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 127
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrbu8_128(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrbu8_128:
+define i8* @ldrbu8_128(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrbu8_128:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
 ; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 128
   %0 = bitcast i8* %z to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %y to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_ldrwf32_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwf32_4:
+define i8* @ldrwf32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
@@ -717,25 +717,106 @@ define i8* @post_ldrwf32_4(i8* %x, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <4 x float>*
-  %1 = load <4 x float>, <4 x float>* %0, align 8
+  %1 = load <4 x float>, <4 x float>* %0, align 4
   %2 = bitcast i8* %y to <4 x float>*
-  store <4 x float> %1, <4 x float>* %2, align 8
+  store <4 x float> %1, <4 x float>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_ldrwf16_4(i8* %x, i8* %y) {
-; CHECK-LABEL: post_ldrwf16_4:
+define i8* @ldrwf16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwf16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x half>*
-  %1 = load <8 x half>, <8 x half>* %0, align 8
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrwi32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrwi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 1
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhi16_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhi16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 1
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhi32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrhi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i16>*
+  %1 = load <4 x i16>, <4 x i16>* %0, align 1
+  %2 = bitcast i8* %y to <4 x i32>*
+  %3 = sext <4 x i16> %1 to <4 x i32>
+  store <4 x i32> %3, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrf32_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrf32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 1
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrf16_align1(i8* %x, i8* %y) {
+; CHECK-LABEL: ldrf16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 1
   %2 = bitcast i8* %y to <8 x half>*
-  store <8 x half> %1, <8 x half>* %2, align 8
+  store <8 x half> %1, <8 x half>* %2, align 2
   ret i8* %z
 }
 
@@ -743,8 +824,8 @@ entry:
 
 
 
-define i8* @post_strw32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_4:
+define i8* @strw32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
@@ -753,14 +834,14 @@ define i8* @post_strw32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_3:
+define i8* @strw32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
@@ -769,14 +850,14 @@ define i8* @post_strw32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_m4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m4:
+define i8* @strw32_m4(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0, #-4]
@@ -785,14 +866,14 @@ define i8* @post_strw32_m4(i8* %y, i8* %
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -4
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_508(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_508:
+define i8* @strw32_508(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #508
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
@@ -801,14 +882,14 @@ define i8* @post_strw32_508(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_512(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_512:
+define i8* @strw32_512(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #512
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
@@ -817,14 +898,14 @@ define i8* @post_strw32_512(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_m508(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m508:
+define i8* @strw32_m508(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m508:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    sub.w r0, r0, #508
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
@@ -833,14 +914,14 @@ define i8* @post_strw32_m508(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -508
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strw32_m512(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strw32_m512:
+define i8* @strw32_m512(i8* %y, i8* %x) {
+; CHECK-LABEL: strw32_m512:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    sub.w r0, r0, #512
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
@@ -849,15 +930,15 @@ define i8* @post_strw32_m512(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 -512
   %0 = bitcast i8* %x to <4 x i32>*
-  %1 = load <4 x i32>, <4 x i32>* %0, align 8
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
   %2 = bitcast i8* %z to <4 x i32>*
-  store <4 x i32> %1, <4 x i32>* %2, align 8
+  store <4 x i32> %1, <4 x i32>* %2, align 4
   ret i8* %z
 }
 
 
-define i8* @post_strh32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_4:
+define i8* @strh32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
 ; CHECK-NEXT:    vstrh.32 q0, [r0, #4]
@@ -866,14 +947,14 @@ define i8* @post_strh32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_3:
+define i8* @strh32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
@@ -882,14 +963,14 @@ define i8* @post_strh32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_2(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_2:
+define i8* @strh32_2(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #2
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
@@ -898,14 +979,14 @@ define i8* @post_strh32_2(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 2
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_254(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_254:
+define i8* @strh32_254(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #254
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
@@ -914,14 +995,14 @@ define i8* @post_strh32_254(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 254
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh32_256(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh32_256:
+define i8* @strh32_256(i8* %y, i8* %x) {
+; CHECK-LABEL: strh32_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #256
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
@@ -930,96 +1011,96 @@ define i8* @post_strh32_256(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 256
   %0 = bitcast i8* %x to <4 x i16>*
-  %1 = load <4 x i16>, <4 x i16>* %0, align 8
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
   %2 = bitcast i8* %z to <4 x i16>*
-  store <4 x i16> %1, <4 x i16>* %2, align 8
+  store <4 x i16> %1, <4 x i16>* %2, align 2
   ret i8* %z
 }
 
 
-define i8* @post_strh16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_4:
+define i8* @strh16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_3:
+define i8* @strh16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_2(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_2:
+define i8* @strh16_2(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_2:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, #2]
 ; CHECK-NEXT:    adds r0, #2
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 2
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_254(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_254:
+define i8* @strh16_254(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_254:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #254
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 254
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
-define i8* @post_strh16_256(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strh16_256:
+define i8* @strh16_256(i8* %y, i8* %x) {
+; CHECK-LABEL: strh16_256:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    add.w r0, r0, #256
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 256
   %0 = bitcast i8* %x to <8 x i16>*
-  %1 = load <8 x i16>, <8 x i16>* %0, align 8
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
   %2 = bitcast i8* %z to <8 x i16>*
-  store <8 x i16> %1, <8 x i16>* %2, align 8
+  store <8 x i16> %1, <8 x i16>* %2, align 2
   ret i8* %z
 }
 
 
-define i8* @post_strb32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_4:
+define i8* @strb32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vstrb.32 q0, [r0, #4]
@@ -1028,14 +1109,14 @@ define i8* @post_strb32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb32_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_3:
+define i8* @strb32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
@@ -1044,14 +1125,14 @@ define i8* @post_strb32_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb32_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_127:
+define i8* @strb32_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
@@ -1060,14 +1141,14 @@ define i8* @post_strb32_127(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb32_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb32_128:
+define i8* @strb32_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb32_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
@@ -1076,15 +1157,15 @@ define i8* @post_strb32_128(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <4 x i8>*
-  %1 = load <4 x i8>, <4 x i8>* %0, align 8
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
   %2 = bitcast i8* %z to <4 x i8>*
-  store <4 x i8> %1, <4 x i8>* %2, align 8
+  store <4 x i8> %1, <4 x i8>* %2, align 1
   ret i8* %z
 }
 
 
-define i8* @post_strb16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_4:
+define i8* @strb16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
 ; CHECK-NEXT:    vstrb.16 q0, [r0, #4]
@@ -1093,14 +1174,14 @@ define i8* @post_strb16_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb16_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_3:
+define i8* @strb16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #3
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
@@ -1109,14 +1190,14 @@ define i8* @post_strb16_3(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb16_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_127:
+define i8* @strb16_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
@@ -1125,14 +1206,14 @@ define i8* @post_strb16_127(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb16_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb16_128:
+define i8* @strb16_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb16_128:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vldrb.u16 q0, [r1]
@@ -1141,79 +1222,79 @@ define i8* @post_strb16_128(i8* %y, i8*
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <8 x i8>*
-  %1 = load <8 x i8>, <8 x i8>* %0, align 8
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
   %2 = bitcast i8* %z to <8 x i8>*
-  store <8 x i8> %1, <8 x i8>* %2, align 8
+  store <8 x i8> %1, <8 x i8>* %2, align 1
   ret i8* %z
 }
 
 
-define i8* @post_strb8_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_4:
+define i8* @strb8_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb8_3(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_3:
+define i8* @strb8_3(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_3:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
 ; CHECK-NEXT:    adds r0, #3
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 3
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb8_127(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_127:
+define i8* @strb8_127(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_127:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adds r0, #127
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 127
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strb8_128(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strb8_128:
+define i8* @strb8_128(i8* %y, i8* %x) {
+; CHECK-LABEL: strb8_128:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #128]
 ; CHECK-NEXT:    adds r0, #128
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 128
   %0 = bitcast i8* %x to <16 x i8>*
-  %1 = load <16 x i8>, <16 x i8>* %0, align 8
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
   %2 = bitcast i8* %z to <16 x i8>*
-  store <16 x i8> %1, <16 x i8>* %2, align 8
+  store <16 x i8> %1, <16 x i8>* %2, align 1
   ret i8* %z
 }
 
-define i8* @post_strf32_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strf32_4:
+define i8* @strf32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strf32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
@@ -1222,24 +1303,105 @@ define i8* @post_strf32_4(i8* %y, i8* %x
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <4 x float>*
-  %1 = load <4 x float>, <4 x float>* %0, align 8
+  %1 = load <4 x float>, <4 x float>* %0, align 4
   %2 = bitcast i8* %z to <4 x float>*
-  store <4 x float> %1, <4 x float>* %2, align 8
+  store <4 x float> %1, <4 x float>* %2, align 4
   ret i8* %z
 }
 
-define i8* @post_strf16_4(i8* %y, i8* %x) {
-; CHECK-LABEL: post_strf16_4:
+define i8* @strf16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: strf16_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #4]
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, #4]
 ; CHECK-NEXT:    adds r0, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x half>*
-  %1 = load <8 x half>, <8 x half>* %0, align 8
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @strwi32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strwi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %z to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strhi16_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strhi16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  %2 = bitcast i8* %z to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strhi32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strhi32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrh.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %z to <4 x i16>*
+  %3 = trunc <4 x i32> %1 to <4 x i16>
+  store <4 x i16> %3, <4 x i16>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strf32_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strf32_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @strf16_align1(i8* %y, i8* %x) {
+; CHECK-LABEL: strf16_align1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r0, #3]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 2
   %2 = bitcast i8* %z to <8 x half>*
-  store <8 x half> %1, <8 x half>* %2, align 8
+  store <8 x half> %1, <8 x half>* %2, align 1
   ret i8* %z
 }

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-ldst-regimm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-ldst-regimm.ll?rev=368255&r1=368254&r2=368255&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-ldst-regimm.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-ldst-regimm.ll Wed Aug  7 22:58:48 2019
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
 
 %struct.s_int8_t = type { [16 x i8], [16 x i8] }
 %struct.s_int16_t = type { [8 x i16], [8 x i16] }

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll?rev=368255&r1=368254&r2=368255&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-loadstore.ll Wed Aug  7 22:58:48 2019
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
 ; CHECK-LABEL: load_4xi32_a4:




More information about the llvm-commits mailing list