[llvm] r370329 - [ARM] MVE Masked loads and stores

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 29 03:54:35 PDT 2019


Author: dmgreen
Date: Thu Aug 29 03:54:35 2019
New Revision: 370329

URL: http://llvm.org/viewvc/llvm-project?rev=370329&view=rev
Log:
[ARM] MVE Masked loads and stores

Masked loads and stores fit naturally with MVE, the instructions being easily
predicated. This adds lowering for the simple cases of masked loads and stores.
It does not yet deal with widening/narrowing or pre/post inc.

The llvm masked load intrinsic will accept a "passthru" value, dictating the
values used for the zero masked lanes. In MVE the instructions write 0 to the
zero predicated lanes, so we need to match a passthru that isn't 0 (or undef)
with a select instruction to pull in the correct data after the load.

We also need to do something with unaligned loads/stores. Currently this uses a
method similar to the one used in big endian, using a VLDRB.8 (and potentially a
VREV in BE). This does mean that the predicate mask is converted from, for
example, a v4i1 to a v16i1. The VLDR instructions are defined as using the first
bit of the relevant mask lane, so this could potentially load different results
if the predicate is a little odd. As the input is a v4i1 however, I believe this
is OK and all the bits required should be set in the predicate, making the
VLDRB.8 load the same data.

Differential Revision: https://reviews.llvm.org/D66534

Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
    llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
    llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=370329&r1=370328&r2=370329&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Thu Aug 29 03:54:35 2019
@@ -259,6 +259,8 @@ void ARMTargetLowering::addMVEVectorType
     setOperationAction(ISD::UMAX, VT, Legal);
     setOperationAction(ISD::ABS, VT, Legal);
     setOperationAction(ISD::SETCC, VT, Custom);
+    setOperationAction(ISD::MLOAD, VT, Custom);
+    setOperationAction(ISD::MSTORE, VT, Legal);
 
     // No native support for these.
     setOperationAction(ISD::UDIV, VT, Expand);
@@ -300,6 +302,8 @@ void ARMTargetLowering::addMVEVectorType
     setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
     setOperationAction(ISD::SETCC, VT, Custom);
+    setOperationAction(ISD::MLOAD, VT, Custom);
+    setOperationAction(ISD::MSTORE, VT, Legal);
 
     // Pre and Post inc are supported on loads and stores
     for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -8726,6 +8730,31 @@ void ARMTargetLowering::ExpandDIV_Window
   Results.push_back(Upper);
 }
 
+static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
+  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+  MVT VT = Op.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDValue PassThru = N->getPassThru();
+  SDLoc dl(Op);
+
+  if (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
+      (PassThru->getOpcode() == ARMISD::VMOVIMM &&
+       isNullConstant(PassThru->getOperand(0))))
+    return Op;
+
+  // MVE Masked loads use zero as the passthru value. Here we convert undef to
+  // zero too, and other values are lowered to a select.
+  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                                DAG.getTargetConstant(0, dl, MVT::i32));
+  SDValue NewLoad = DAG.getMaskedLoad(
+      VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
+      N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
+  SDValue Combo = NewLoad;
+  if (!PassThru.isUndef())
+    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
+  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
     // Acquire/Release load/store is not legal for targets without a dmb or
@@ -8925,6 +8954,8 @@ SDValue ARMTargetLowering::LowerOperatio
   case ISD::UADDO:
   case ISD::USUBO:
     return LowerUnsignedALUO(Op, DAG);
+  case ISD::MLOAD:
+    return LowerMLOAD(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);

Modified: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrMVE.td?rev=370329&r1=370328&r2=370329&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td Thu Aug 29 03:54:35 2019
@@ -4810,6 +4810,10 @@ class MVE_vector_store_typed<ValueType T
                              PatFrag StoreKind, int shift>
   : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
         (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
+                                   PatFrag StoreKind, int shift>
+  : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
+        (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>;
 
 multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
                             int shift> {
@@ -4826,6 +4830,10 @@ class MVE_vector_load_typed<ValueType Ty
                             PatFrag LoadKind, int shift>
   : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
         (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
+                                  PatFrag LoadKind, int shift>
+  : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
+        (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>;
 
 multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
                            int shift> {
@@ -4871,6 +4879,28 @@ def aligned16_post_store : PatFrag<(ops
   return cast<StoreSDNode>(N)->getAlignment() >= 2;
 }]>;
 
+def alignedmaskedload32 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                                  (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def alignedmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                                  (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+def maskedload : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                         (masked_ld node:$ptr, node:$pred, node:$passthru)>;
+
+def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                   (masked_st node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def alignedmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                   (masked_st node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                          (masked_st node:$val, node:$ptr, node:$pred)>;
+
 let Predicates = [HasMVEInt, IsLE] in {
   // Stores
   defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
@@ -4889,6 +4919,26 @@ let Predicates = [HasMVEInt, IsLE] in {
   defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
   defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
   defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;
+
+  // Unaligned masked stores (aligned are below)
+  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+
+  // Unaligned masked loads
+  def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
+            (v4f32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
+            (v8f16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
 }
 
 let Predicates = [HasMVEInt, IsBE] in {
@@ -4943,9 +4993,41 @@ let Predicates = [HasMVEInt, IsBE] in {
   def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
   def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
   def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
+
+  // Unaligned masked stores (aligned are below)
+  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  // Unaligned masked loads
+  def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
+  def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
+            (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
+  def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
+  def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
+            (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
 }
 
 let Predicates = [HasMVEInt] in {
+  // Aligned masked store, shared between LE and BE
+  def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore, 0>;
+  def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
+  def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
+  def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
+  def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
+  // Aligned masked loads
+  def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload, 0>;
+  def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+  def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+  def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+  def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+
   // Predicate loads
   def  : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
              (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;

Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h?rev=370329&r1=370328&r2=370329&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h Thu Aug 29 03:54:35 2019
@@ -106,6 +106,20 @@ public:
     return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
   }
 
+  bool isLegalMaskedLoad(Type *DataTy) {
+    if (!ST->hasMVEIntegerOps())
+      return false;
+
+    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+    if (VecWidth != 128)
+      return false;
+
+    unsigned EltWidth = DataTy->getScalarSizeInBits();
+    return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
+  }
+
+  bool isLegalMaskedStore(Type *DataTy) { return isLegalMaskedLoad(DataTy); }
+
   /// \name Scalar TTI Implementations
   /// @{
 

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll?rev=370329&r1=370328&r2=370329&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll Thu Aug 29 03:54:35 2019
@@ -5,50 +5,11 @@
 define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
 ; CHECK-LABEL: foo_v4i32_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrne r3, [r2]
-; CHECK-NEXT:    vmovne.32 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r1, [r2, #12]
-; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -61,14 +22,14 @@ entry:
 define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
 ; CHECK-LABEL: foo_sext_v4i32_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
@@ -85,28 +46,11 @@ define void @foo_sext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #3]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -120,14 +64,14 @@ entry:
 define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
 ; CHECK-LABEL: foo_sext_v4i32_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrhne r3, [r2]
@@ -144,27 +88,10 @@ define void @foo_sext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #6]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -178,15 +105,15 @@ entry:
 define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
 ; CHECK-LABEL: foo_zext_v4i32_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
@@ -203,27 +130,10 @@ define void @foo_zext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #3]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -237,14 +147,14 @@ entry:
 define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
 ; CHECK-LABEL: foo_zext_v4i32_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrhne r3, [r2]
@@ -261,27 +171,10 @@ define void @foo_zext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #6]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -295,82 +188,11 @@ entry:
 define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
 ; CHECK-LABEL: foo_v8i16_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrhne r3, [r2]
-; CHECK-NEXT:    vmovne.16 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #2]
-; CHECK-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #6]
-; CHECK-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #10]
-; CHECK-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #12]
-; CHECK-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r1, [r2, #14]
-; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-NEXT:    strhne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-NEXT:    strhmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-NEXT:    strhmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-NEXT:    strhmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-NEXT:    strhmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-NEXT:    strhmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-NEXT:    strhmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-NEXT:    strhmi r1, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrht.u16 q0, [r2]
+; CHECK-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -383,14 +205,14 @@ entry:
 define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
 ; CHECK-LABEL: foo_sext_v8i16_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
+; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
@@ -423,43 +245,10 @@ define void @foo_sext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #7]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-NEXT:    strhne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-NEXT:    strhmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-NEXT:    strhmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-NEXT:    strhmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-NEXT:    strhmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-NEXT:    strhmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-NEXT:    strhmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-NEXT:    strhmi r1, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -473,14 +262,14 @@ entry:
 define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
 ; CHECK-LABEL: foo_zext_v8i16_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
+; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
@@ -513,43 +302,10 @@ define void @foo_zext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #7]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-NEXT:    strhne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-NEXT:    strhmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-NEXT:    strhmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-NEXT:    strhmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-NEXT:    strhmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-NEXT:    strhmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-NEXT:    strhmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-NEXT:    strhmi r1, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -563,155 +319,12 @@ entry:
 define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src) {
 ; CHECK-LABEL: foo_v16i8_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r6, r7, lr}
-; CHECK-NEXT:    .setfp r7, sp, #8
-; CHECK-NEXT:    add r7, sp, #8
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    mov r4, sp
-; CHECK-NEXT:    bfc r4, #0, #4
-; CHECK-NEXT:    mov sp, r4
 ; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #16
-; CHECK-NEXT:    sub.w r4, r7, #8
 ; CHECK-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrh.w r1, [sp, #16]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrbne r3, [r2]
-; CHECK-NEXT:    vmovne.8 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #1]
-; CHECK-NEXT:    vmovmi.8 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #2]
-; CHECK-NEXT:    vmovmi.8 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #3]
-; CHECK-NEXT:    vmovmi.8 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.8 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #5]
-; CHECK-NEXT:    vmovmi.8 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #6]
-; CHECK-NEXT:    vmovmi.8 q0[6], r3
-; CHECK-NEXT:    lsls r3, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #7]
-; CHECK-NEXT:    vmovmi.8 q0[7], r3
-; CHECK-NEXT:    lsls r3, r1, #23
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.8 q0[8], r3
-; CHECK-NEXT:    lsls r3, r1, #22
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #9]
-; CHECK-NEXT:    vmovmi.8 q0[9], r3
-; CHECK-NEXT:    lsls r3, r1, #21
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #10]
-; CHECK-NEXT:    vmovmi.8 q0[10], r3
-; CHECK-NEXT:    lsls r3, r1, #20
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #11]
-; CHECK-NEXT:    vmovmi.8 q0[11], r3
-; CHECK-NEXT:    lsls r3, r1, #19
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #12]
-; CHECK-NEXT:    vmovmi.8 q0[12], r3
-; CHECK-NEXT:    lsls r3, r1, #18
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #13]
-; CHECK-NEXT:    vmovmi.8 q0[13], r3
-; CHECK-NEXT:    lsls r3, r1, #17
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #14]
-; CHECK-NEXT:    vmovmi.8 q0[14], r3
-; CHECK-NEXT:    lsls r1, r1, #16
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r1, [r2, #15]
-; CHECK-NEXT:    vmovmi.8 q0[15], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrh.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u8 r2, q0[0]
-; CHECK-NEXT:    strbne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[1]
-; CHECK-NEXT:    strbmi r2, [r0, #1]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[2]
-; CHECK-NEXT:    strbmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[3]
-; CHECK-NEXT:    strbmi r2, [r0, #3]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[4]
-; CHECK-NEXT:    strbmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[5]
-; CHECK-NEXT:    strbmi r2, [r0, #5]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[6]
-; CHECK-NEXT:    strbmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[7]
-; CHECK-NEXT:    strbmi r2, [r0, #7]
-; CHECK-NEXT:    lsls r2, r1, #23
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[8]
-; CHECK-NEXT:    strbmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #22
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[9]
-; CHECK-NEXT:    strbmi r2, [r0, #9]
-; CHECK-NEXT:    lsls r2, r1, #21
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[10]
-; CHECK-NEXT:    strbmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #20
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[11]
-; CHECK-NEXT:    strbmi r2, [r0, #11]
-; CHECK-NEXT:    lsls r2, r1, #19
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[12]
-; CHECK-NEXT:    strbmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r2, r1, #18
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[13]
-; CHECK-NEXT:    strbmi r2, [r0, #13]
-; CHECK-NEXT:    lsls r2, r1, #17
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[14]
-; CHECK-NEXT:    strbmi r2, [r0, #14]
-; CHECK-NEXT:    lsls r1, r1, #16
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-NEXT:    strbmi r1, [r0, #15]
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrbt.u8 q0, [r2]
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
 entry:
   %0 = load <16 x i8>, <16 x i8>* %mask, align 1
   %1 = icmp sgt <16 x i8> %0, zeroinitializer
@@ -723,48 +336,14 @@ entry:
 define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
 ; CHECK-LABEL: foo_trunc_v8i8_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrhne r3, [r2]
-; CHECK-NEXT:    vmovne.16 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #2]
-; CHECK-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #6]
-; CHECK-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #10]
-; CHECK-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #12]
-; CHECK-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r1, [r2, #14]
-; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r2]
 ; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
@@ -798,7 +377,7 @@ define void @foo_trunc_v8i8_v8i16(<8 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
 ; CHECK-NEXT:    strbmi r1, [r0, #7]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -812,32 +391,14 @@ entry:
 define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
 ; CHECK-LABEL: foo_trunc_v4i8_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrne r3, [r2]
-; CHECK-NEXT:    vmovne.32 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r1, [r2, #12]
-; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
 ; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
@@ -855,7 +416,7 @@ define void @foo_trunc_v4i8_v4i32(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    vmovmi r1, s3
 ; CHECK-NEXT:    strbmi r1, [r0, #3]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -869,32 +430,14 @@ entry:
 define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
 ; CHECK-LABEL: foo_trunc_v4i16_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
 ; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrne r3, [r2]
-; CHECK-NEXT:    vmovne.32 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r1, [r2, #12]
-; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
 ; CHECK-NEXT:    ldrb.w r1, [sp]
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
@@ -912,7 +455,7 @@ define void @foo_trunc_v4i16_v4i32(<4 x
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    vmovmi r1, s3
 ; CHECK-NEXT:    strhmi r1, [r0, #6]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -926,42 +469,11 @@ entry:
 define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *%src) {
 ; CHECK-LABEL: foo_v4f32_v4f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    vldrne s0, [r2]
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vldrmi s1, [r2, #4]
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vldrmi s2, [r2, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vldrmi s3, [r2, #12]
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    vstrne s0, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -974,147 +486,11 @@ entry:
 define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%src) {
 ; CHECK-LABEL: foo_v8f16_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    bne .LBB13_18
-; CHECK-NEXT:  @ %bb.1: @ %else
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bmi .LBB13_19
-; CHECK-NEXT:  .LBB13_2: @ %else2
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bmi .LBB13_20
-; CHECK-NEXT:  .LBB13_3: @ %else5
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    bmi .LBB13_21
-; CHECK-NEXT:  .LBB13_4: @ %else8
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    bmi .LBB13_22
-; CHECK-NEXT:  .LBB13_5: @ %else11
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    bmi .LBB13_23
-; CHECK-NEXT:  .LBB13_6: @ %else14
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    bmi .LBB13_24
-; CHECK-NEXT:  .LBB13_7: @ %else17
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bpl .LBB13_9
-; CHECK-NEXT:  .LBB13_8: @ %cond.load19
-; CHECK-NEXT:    vldr.16 s4, [r2, #14]
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.16 q0[7], r1
-; CHECK-NEXT:  .LBB13_9: @ %else20
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    bne .LBB13_25
-; CHECK-NEXT:  @ %bb.10: @ %else23
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    bmi .LBB13_26
-; CHECK-NEXT:  .LBB13_11: @ %else25
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    bmi .LBB13_27
-; CHECK-NEXT:  .LBB13_12: @ %else27
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    bmi .LBB13_28
-; CHECK-NEXT:  .LBB13_13: @ %else29
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    bmi .LBB13_29
-; CHECK-NEXT:  .LBB13_14: @ %else31
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    bmi .LBB13_30
-; CHECK-NEXT:  .LBB13_15: @ %else33
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    bmi .LBB13_31
-; CHECK-NEXT:  .LBB13_16: @ %else35
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bmi .LBB13_32
-; CHECK-NEXT:  .LBB13_17: @ %else37
-; CHECK-NEXT:    add sp, #16
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB13_18: @ %cond.load
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bpl .LBB13_2
-; CHECK-NEXT:  .LBB13_19: @ %cond.load1
-; CHECK-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bpl .LBB13_3
-; CHECK-NEXT:  .LBB13_20: @ %cond.load4
-; CHECK-NEXT:    vldr.16 s4, [r2, #4]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    bpl .LBB13_4
-; CHECK-NEXT:  .LBB13_21: @ %cond.load7
-; CHECK-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    bpl .LBB13_5
-; CHECK-NEXT:  .LBB13_22: @ %cond.load10
-; CHECK-NEXT:    vldr.16 s4, [r2, #8]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    bpl .LBB13_6
-; CHECK-NEXT:  .LBB13_23: @ %cond.load13
-; CHECK-NEXT:    vldr.16 s4, [r2, #10]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    bpl .LBB13_7
-; CHECK-NEXT:  .LBB13_24: @ %cond.load16
-; CHECK-NEXT:    vldr.16 s4, [r2, #12]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bmi .LBB13_8
-; CHECK-NEXT:    b .LBB13_9
-; CHECK-NEXT:  .LBB13_25: @ %cond.store
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    bpl .LBB13_11
-; CHECK-NEXT:  .LBB13_26: @ %cond.store24
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    bpl .LBB13_12
-; CHECK-NEXT:  .LBB13_27: @ %cond.store26
-; CHECK-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    bpl .LBB13_13
-; CHECK-NEXT:  .LBB13_28: @ %cond.store28
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    bpl .LBB13_14
-; CHECK-NEXT:  .LBB13_29: @ %cond.store30
-; CHECK-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    bpl .LBB13_15
-; CHECK-NEXT:  .LBB13_30: @ %cond.store32
-; CHECK-NEXT:    vmovx.f16 s4, s2
-; CHECK-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    bpl .LBB13_16
-; CHECK-NEXT:  .LBB13_31: @ %cond.store34
-; CHECK-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bpl .LBB13_17
-; CHECK-NEXT:  .LBB13_32: @ %cond.store36
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrht.u16 q0, [r2]
+; CHECK-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll?rev=370329&r1=370328&r2=370329&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll Thu Aug 29 03:54:35 2019
@@ -5,72 +5,18 @@
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_zero(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    beq .LBB0_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    ldr r3, [r0]
-; CHECK-LE-NEXT:    vdup.32 q0, r2
-; CHECK-LE-NEXT:    vmov.32 q0[0], r3
-; CHECK-LE-NEXT:    b .LBB0_3
-; CHECK-LE-NEXT:  .LBB0_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB0_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    beq .LBB0_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    ldr r3, [r0]
-; CHECK-BE-NEXT:    vdup.32 q1, r2
-; CHECK-BE-NEXT:    vmov.32 q1[0], r3
-; CHECK-BE-NEXT:    b .LBB0_3
-; CHECK-BE-NEXT:  .LBB0_2:
-; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-BE-NEXT:  .LBB0_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -81,60 +27,18 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_undef(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -145,60 +49,19 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev32.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -209,58 +72,20 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_other(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q1, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -271,63 +96,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    lsls r3, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r3, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r3, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r3, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r2
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r3, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q0[0], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r3, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r3, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q0[3], r2
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -342,65 +125,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q0[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -417,105 +156,23 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_zero(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    beq .LBB6_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    ldrh r3, [r0]
-; CHECK-LE-NEXT:    vdup.16 q0, r2
-; CHECK-LE-NEXT:    vmov.16 q0[0], r3
-; CHECK-LE-NEXT:    b .LBB6_3
-; CHECK-LE-NEXT:  .LBB6_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB6_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    beq .LBB6_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    ldrh r3, [r0]
-; CHECK-BE-NEXT:    vdup.16 q1, r2
-; CHECK-BE-NEXT:    vmov.16 q1[0], r3
-; CHECK-BE-NEXT:    b .LBB6_3
-; CHECK-BE-NEXT:  .LBB6_2:
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vrev32.16 q1, q0
-; CHECK-BE-NEXT:  .LBB6_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vrev32.16 q1, q1
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -526,92 +183,18 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_undef(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -622,92 +205,19 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev16.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -718,90 +228,20 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_other(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q1, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -812,99 +252,25 @@ entry:
 define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    lsls r3, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r3, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r2
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0, #4]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r3, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q0[0], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q0[7], r2
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0, #4]
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -919,97 +285,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q0[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -1025,186 +315,24 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_zero(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    mov r1, sp
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    beq .LBB12_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    ldrb r3, [r0]
-; CHECK-LE-NEXT:    vdup.8 q0, r2
-; CHECK-LE-NEXT:    vmov.8 q0[0], r3
-; CHECK-LE-NEXT:    b .LBB12_3
-; CHECK-LE-NEXT:  .LBB12_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB12_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    beq .LBB12_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    ldrb r3, [r0]
-; CHECK-BE-NEXT:    vdup.8 q1, r2
-; CHECK-BE-NEXT:    vmov.8 q1[0], r3
-; CHECK-BE-NEXT:    b .LBB12_3
-; CHECK-BE-NEXT:  .LBB12_2:
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vrev32.8 q1, q0
-; CHECK-BE-NEXT:  .LBB12_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q1[6], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q1[7], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q1[8], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q1[9], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q1[10], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q1[11], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q1[12], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q1[13], r2
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q1[14], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q1[15], r0
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.8 q2, q0
+; CHECK-BE-NEXT:    vrev32.8 q1, q1
+; CHECK-BE-NEXT:    vcmp.s8 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> zeroinitializer)
@@ -1214,173 +342,19 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_undef(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q1[6], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q1[7], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q1[8], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q1[9], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q1[10], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q1[11], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q1[12], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q1[13], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q1[14], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q1[15], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> undef)
@@ -1390,171 +364,21 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_other(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q1, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q1[6], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q1[7], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q1[8], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q1[9], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q1[10], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q1[11], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q1[12], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q1[13], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q1[14], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q1[15], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> %a)
@@ -1564,176 +388,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrh.w r2, [sp]
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    lsls r3, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r3, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r2
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0, #4]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    ldrh.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r3, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q0[0], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q0[2], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q0[3], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q0[4], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q0[5], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q0[6], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q0[7], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q0[8], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q0[9], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q0[10], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q0[11], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q0[12], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q0[13], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q0[14], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q0[15], r2
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0, #4]
 ; CHECK-BE-NEXT:    vstrb.8 q0, [r1]
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <16 x i8>*
@@ -1747,178 +417,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r3, [sp]
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    ldrh.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q0[15], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-BE-NEXT:    vstrb.8 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
@@ -1933,77 +447,23 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_zero(<4 x float> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    beq .LBB17_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    vldr s0, .LCPI17_0
-; CHECK-LE-NEXT:    vldr s4, [r0]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vdup.32 q0, r2
-; CHECK-LE-NEXT:    vmov.f32 s0, s4
-; CHECK-LE-NEXT:    b .LBB17_3
-; CHECK-LE-NEXT:  .LBB17_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB17_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:    .p2align 2
-; CHECK-LE-NEXT:  @ %bb.4:
-; CHECK-LE-NEXT:  .LCPI17_0:
-; CHECK-LE-NEXT:    .long 0 @ float 0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    beq .LBB17_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    vldr s0, .LCPI17_0
-; CHECK-BE-NEXT:    vldr s2, [r0]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vdup.32 q1, r2
-; CHECK-BE-NEXT:    vmov.f32 s4, s2
-; CHECK-BE-NEXT:    b .LBB17_3
-; CHECK-BE-NEXT:  .LBB17_2:
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
 ; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-BE-NEXT:  .LBB17_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s7, [r0, #12]
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:    .p2align 2
-; CHECK-BE-NEXT:  @ %bb.4:
-; CHECK-BE-NEXT:  .LCPI17_0:
-; CHECK-BE-NEXT:    .long 0 @ float 0
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
   %l = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %dest, i32 4, <4 x i1> %c, <4 x float> zeroinitializer)
@@ -2013,52 +473,18 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_undef(<4 x float> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s7, [r0, #12]
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -2069,60 +495,19 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne s0, r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi s1, r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi s2, r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi s3, r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne s4, r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi s5, r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi s6, r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi s7, r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev32.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -2133,52 +518,21 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_other(<4 x float> *%dest, <4 x i32> %a, <4 x float> %b) {
 ; CHECK-LE-LABEL: masked_v4f32_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s4, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s5, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s6, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s7, [r0, #12]
-; CHECK-LE-NEXT:    vmov q0, q1
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q2, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s8, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s9, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s10, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s11, [r0, #12]
-; CHECK-BE-NEXT:    vrev64.32 q0, q2
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q2
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -2189,55 +543,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    lsls r3, r2, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s3, [r0, #12]
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -2252,57 +572,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s3, [r0, #12]
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -2318,185 +602,24 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_zero(<8 x half> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    beq .LBB23_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, .LCPI23_0
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    vmov r3, s0
-; CHECK-LE-NEXT:    vdup.16 q0, r2
-; CHECK-LE-NEXT:    vmov.16 q0[0], r3
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB23_3
-; CHECK-LE-NEXT:    b .LBB23_4
-; CHECK-LE-NEXT:  .LBB23_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB23_4
-; CHECK-LE-NEXT:  .LBB23_3: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:  .LBB23_4: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB23_11
-; CHECK-LE-NEXT:  @ %bb.5: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB23_12
-; CHECK-LE-NEXT:  .LBB23_6: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB23_13
-; CHECK-LE-NEXT:  .LBB23_7: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB23_14
-; CHECK-LE-NEXT:  .LBB23_8: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB23_15
-; CHECK-LE-NEXT:  .LBB23_9: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB23_16
-; CHECK-LE-NEXT:  .LBB23_10: @ %else20
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB23_11: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB23_6
-; CHECK-LE-NEXT:  .LBB23_12: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB23_7
-; CHECK-LE-NEXT:  .LBB23_13: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB23_8
-; CHECK-LE-NEXT:  .LBB23_14: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB23_9
-; CHECK-LE-NEXT:  .LBB23_15: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB23_10
-; CHECK-LE-NEXT:  .LBB23_16: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:    .p2align 1
-; CHECK-LE-NEXT:  @ %bb.17:
-; CHECK-LE-NEXT:  .LCPI23_0:
-; CHECK-LE-NEXT:    .short 0 @ half 0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    beq .LBB23_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, .LCPI23_0
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    vdup.16 q1, r2
-; CHECK-BE-NEXT:    vmov r3, s0
-; CHECK-BE-NEXT:    vmov.16 q1[0], r3
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB23_3
-; CHECK-BE-NEXT:    b .LBB23_4
-; CHECK-BE-NEXT:  .LBB23_2:
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vrev32.16 q1, q0
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB23_4
-; CHECK-BE-NEXT:  .LBB23_3: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[1], r2
-; CHECK-BE-NEXT:  .LBB23_4: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB23_12
-; CHECK-BE-NEXT:  @ %bb.5: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB23_13
-; CHECK-BE-NEXT:  .LBB23_6: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB23_14
-; CHECK-BE-NEXT:  .LBB23_7: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB23_15
-; CHECK-BE-NEXT:  .LBB23_8: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB23_16
-; CHECK-BE-NEXT:  .LBB23_9: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB23_11
-; CHECK-BE-NEXT:  .LBB23_10: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q1[7], r0
-; CHECK-BE-NEXT:  .LBB23_11: @ %else20
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vrev32.16 q1, q1
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB23_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB23_6
-; CHECK-BE-NEXT:  .LBB23_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB23_7
-; CHECK-BE-NEXT:  .LBB23_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB23_8
-; CHECK-BE-NEXT:  .LBB23_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB23_9
-; CHECK-BE-NEXT:  .LBB23_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB23_10
-; CHECK-BE-NEXT:    b .LBB23_11
-; CHECK-BE-NEXT:    .p2align 1
-; CHECK-BE-NEXT:  @ %bb.17:
-; CHECK-BE-NEXT:  .LCPI23_0:
-; CHECK-BE-NEXT:    .short 0 @ half 0
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> zeroinitializer)
@@ -2506,168 +629,19 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_undef(<8 x half> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    bne .LBB24_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB24_10
-; CHECK-LE-NEXT:  .LBB24_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB24_11
-; CHECK-LE-NEXT:  .LBB24_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB24_12
-; CHECK-LE-NEXT:  .LBB24_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB24_13
-; CHECK-LE-NEXT:  .LBB24_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB24_14
-; CHECK-LE-NEXT:  .LBB24_6: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB24_15
-; CHECK-LE-NEXT:  .LBB24_7: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB24_16
-; CHECK-LE-NEXT:  .LBB24_8: @ %else20
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB24_9: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB24_2
-; CHECK-LE-NEXT:  .LBB24_10: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB24_3
-; CHECK-LE-NEXT:  .LBB24_11: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB24_4
-; CHECK-LE-NEXT:  .LBB24_12: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB24_5
-; CHECK-LE-NEXT:  .LBB24_13: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB24_6
-; CHECK-LE-NEXT:  .LBB24_14: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB24_7
-; CHECK-LE-NEXT:  .LBB24_15: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB24_8
-; CHECK-LE-NEXT:  .LBB24_16: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    bne .LBB24_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB24_11
-; CHECK-BE-NEXT:  .LBB24_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB24_12
-; CHECK-BE-NEXT:  .LBB24_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB24_13
-; CHECK-BE-NEXT:  .LBB24_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB24_14
-; CHECK-BE-NEXT:  .LBB24_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB24_15
-; CHECK-BE-NEXT:  .LBB24_6: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB24_16
-; CHECK-BE-NEXT:  .LBB24_7: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB24_9
-; CHECK-BE-NEXT:  .LBB24_8: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q1[7], r0
-; CHECK-BE-NEXT:  .LBB24_9: @ %else20
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB24_10: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB24_2
-; CHECK-BE-NEXT:  .LBB24_11: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB24_3
-; CHECK-BE-NEXT:  .LBB24_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB24_4
-; CHECK-BE-NEXT:  .LBB24_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB24_5
-; CHECK-BE-NEXT:  .LBB24_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB24_6
-; CHECK-BE-NEXT:  .LBB24_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB24_7
-; CHECK-BE-NEXT:  .LBB24_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB24_8
-; CHECK-BE-NEXT:    b .LBB24_9
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> undef)
@@ -2677,200 +651,20 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #40
-; CHECK-LE-NEXT:    sub sp, #40
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    add r1, sp, #32
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    bne .LBB25_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB25_10
-; CHECK-LE-NEXT:  .LBB25_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB25_11
-; CHECK-LE-NEXT:  .LBB25_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB25_12
-; CHECK-LE-NEXT:  .LBB25_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB25_13
-; CHECK-LE-NEXT:  .LBB25_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB25_14
-; CHECK-LE-NEXT:  .LBB25_6: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB25_15
-; CHECK-LE-NEXT:  .LBB25_7: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB25_16
-; CHECK-LE-NEXT:  .LBB25_8: @ %else20
-; CHECK-LE-NEXT:    add sp, #40
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB25_9: @ %cond.load
-; CHECK-LE-NEXT:    ldrh r2, [r0]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #28]
-; CHECK-LE-NEXT:    vldr.16 s0, [sp, #28]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB25_2
-; CHECK-LE-NEXT:  .LBB25_10: @ %cond.load1
-; CHECK-LE-NEXT:    ldrh r2, [r0, #2]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #24]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #24]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB25_3
-; CHECK-LE-NEXT:  .LBB25_11: @ %cond.load4
-; CHECK-LE-NEXT:    ldrh r2, [r0, #4]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #20]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #20]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB25_4
-; CHECK-LE-NEXT:  .LBB25_12: @ %cond.load7
-; CHECK-LE-NEXT:    ldrh r2, [r0, #6]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #16]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #16]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB25_5
-; CHECK-LE-NEXT:  .LBB25_13: @ %cond.load10
-; CHECK-LE-NEXT:    ldrh r2, [r0, #8]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #12]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB25_6
-; CHECK-LE-NEXT:  .LBB25_14: @ %cond.load13
-; CHECK-LE-NEXT:    ldrh r2, [r0, #10]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #8]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB25_7
-; CHECK-LE-NEXT:  .LBB25_15: @ %cond.load16
-; CHECK-LE-NEXT:    ldrh r2, [r0, #12]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #4]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB25_8
-; CHECK-LE-NEXT:  .LBB25_16: @ %cond.load19
-; CHECK-LE-NEXT:    ldrh r0, [r0, #14]
-; CHECK-LE-NEXT:    strh.w r0, [sp]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #40
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #40
-; CHECK-BE-NEXT:    sub sp, #40
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    add r1, sp, #32
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    bne .LBB25_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB25_11
-; CHECK-BE-NEXT:  .LBB25_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB25_12
-; CHECK-BE-NEXT:  .LBB25_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB25_13
-; CHECK-BE-NEXT:  .LBB25_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB25_14
-; CHECK-BE-NEXT:  .LBB25_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB25_15
-; CHECK-BE-NEXT:  .LBB25_6: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB25_16
-; CHECK-BE-NEXT:  .LBB25_7: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB25_9
-; CHECK-BE-NEXT:  .LBB25_8: @ %cond.load19
-; CHECK-BE-NEXT:    ldrh r0, [r0, #14]
-; CHECK-BE-NEXT:    strh.w r0, [sp]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q1[7], r0
-; CHECK-BE-NEXT:  .LBB25_9: @ %else20
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev16.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #40
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB25_10: @ %cond.load
-; CHECK-BE-NEXT:    ldrh r2, [r0]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #28]
-; CHECK-BE-NEXT:    vldr.16 s4, [sp, #28]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB25_2
-; CHECK-BE-NEXT:  .LBB25_11: @ %cond.load1
-; CHECK-BE-NEXT:    ldrh r2, [r0, #2]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #24]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #24]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB25_3
-; CHECK-BE-NEXT:  .LBB25_12: @ %cond.load4
-; CHECK-BE-NEXT:    ldrh r2, [r0, #4]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #20]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #20]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB25_4
-; CHECK-BE-NEXT:  .LBB25_13: @ %cond.load7
-; CHECK-BE-NEXT:    ldrh r2, [r0, #6]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #16]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #16]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB25_5
-; CHECK-BE-NEXT:  .LBB25_14: @ %cond.load10
-; CHECK-BE-NEXT:    ldrh r2, [r0, #8]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #12]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB25_6
-; CHECK-BE-NEXT:  .LBB25_15: @ %cond.load13
-; CHECK-BE-NEXT:    ldrh r2, [r0, #10]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #8]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB25_7
-; CHECK-BE-NEXT:  .LBB25_16: @ %cond.load16
-; CHECK-BE-NEXT:    ldrh r2, [r0, #12]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #4]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB25_8
-; CHECK-BE-NEXT:    b .LBB25_9
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 1, <8 x i1> %c, <8 x half> undef)
@@ -2880,171 +674,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_other(<8 x half> *%dest, <8 x i16> %a, <8 x half> %b) {
 ; CHECK-LE-LABEL: masked_v8f16_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    bne .LBB26_10
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB26_11
-; CHECK-LE-NEXT:  .LBB26_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB26_12
-; CHECK-LE-NEXT:  .LBB26_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB26_13
-; CHECK-LE-NEXT:  .LBB26_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB26_14
-; CHECK-LE-NEXT:  .LBB26_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB26_15
-; CHECK-LE-NEXT:  .LBB26_6: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB26_16
-; CHECK-LE-NEXT:  .LBB26_7: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB26_9
-; CHECK-LE-NEXT:  .LBB26_8: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s0
-; CHECK-LE-NEXT:    vmov.16 q1[7], r0
-; CHECK-LE-NEXT:  .LBB26_9: @ %else20
-; CHECK-LE-NEXT:    vmov q0, q1
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB26_10: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB26_2
-; CHECK-LE-NEXT:  .LBB26_11: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB26_3
-; CHECK-LE-NEXT:  .LBB26_12: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB26_4
-; CHECK-LE-NEXT:  .LBB26_13: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB26_5
-; CHECK-LE-NEXT:  .LBB26_14: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB26_6
-; CHECK-LE-NEXT:  .LBB26_15: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB26_7
-; CHECK-LE-NEXT:  .LBB26_16: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB26_8
-; CHECK-LE-NEXT:    b .LBB26_9
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vrev64.16 q2, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    bne .LBB26_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB26_11
-; CHECK-BE-NEXT:  .LBB26_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB26_12
-; CHECK-BE-NEXT:  .LBB26_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB26_13
-; CHECK-BE-NEXT:  .LBB26_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB26_14
-; CHECK-BE-NEXT:  .LBB26_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB26_15
-; CHECK-BE-NEXT:  .LBB26_6: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB26_16
-; CHECK-BE-NEXT:  .LBB26_7: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB26_9
-; CHECK-BE-NEXT:  .LBB26_8: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q2[7], r0
-; CHECK-BE-NEXT:  .LBB26_9: @ %else20
-; CHECK-BE-NEXT:    vrev64.16 q0, q2
-; CHECK-BE-NEXT:    add sp, #8
-; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB26_10: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB26_2
-; CHECK-BE-NEXT:  .LBB26_11: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB26_3
-; CHECK-BE-NEXT:  .LBB26_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB26_4
-; CHECK-BE-NEXT:  .LBB26_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB26_5
-; CHECK-BE-NEXT:  .LBB26_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB26_6
-; CHECK-BE-NEXT:  .LBB26_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB26_7
-; CHECK-BE-NEXT:  .LBB26_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB26_8
-; CHECK-BE-NEXT:    b .LBB26_9
+; CHECK-BE-NEXT:    vrev64.16 q1, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q2
+; CHECK-BE-NEXT:    vrev64.16 q0, q1
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> %b)
@@ -3054,170 +699,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    lsls r3, r2, #31
-; CHECK-LE-NEXT:    bne .LBB27_10
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    bmi .LBB27_11
-; CHECK-LE-NEXT:  .LBB27_2: @ %else2
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    bmi .LBB27_12
-; CHECK-LE-NEXT:  .LBB27_3: @ %else5
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    bmi .LBB27_13
-; CHECK-LE-NEXT:  .LBB27_4: @ %else8
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    bmi .LBB27_14
-; CHECK-LE-NEXT:  .LBB27_5: @ %else11
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    bmi .LBB27_15
-; CHECK-LE-NEXT:  .LBB27_6: @ %else14
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    bmi .LBB27_16
-; CHECK-LE-NEXT:  .LBB27_7: @ %else17
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    bpl .LBB27_9
-; CHECK-LE-NEXT:  .LBB27_8: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r2
-; CHECK-LE-NEXT:  .LBB27_9: @ %else20
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0, #4]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB27_10: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    bpl .LBB27_2
-; CHECK-LE-NEXT:  .LBB27_11: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    bpl .LBB27_3
-; CHECK-LE-NEXT:  .LBB27_12: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    bpl .LBB27_4
-; CHECK-LE-NEXT:  .LBB27_13: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    bpl .LBB27_5
-; CHECK-LE-NEXT:  .LBB27_14: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    bpl .LBB27_6
-; CHECK-LE-NEXT:  .LBB27_15: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    bpl .LBB27_7
-; CHECK-LE-NEXT:  .LBB27_16: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    bmi .LBB27_8
-; CHECK-LE-NEXT:    b .LBB27_9
 ;
 ; CHECK-BE-LABEL: masked_v8f16_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
-; CHECK-BE-NEXT:    bne .LBB27_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    bmi .LBB27_11
-; CHECK-BE-NEXT:  .LBB27_2: @ %else2
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    bmi .LBB27_12
-; CHECK-BE-NEXT:  .LBB27_3: @ %else5
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    bmi .LBB27_13
-; CHECK-BE-NEXT:  .LBB27_4: @ %else8
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    bmi .LBB27_14
-; CHECK-BE-NEXT:  .LBB27_5: @ %else11
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    bmi .LBB27_15
-; CHECK-BE-NEXT:  .LBB27_6: @ %else14
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    bmi .LBB27_16
-; CHECK-BE-NEXT:  .LBB27_7: @ %else17
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    bpl .LBB27_9
-; CHECK-BE-NEXT:  .LBB27_8: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[7], r2
-; CHECK-BE-NEXT:  .LBB27_9: @ %else20
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0, #4]
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB27_10: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    bpl .LBB27_2
-; CHECK-BE-NEXT:  .LBB27_11: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    bpl .LBB27_3
-; CHECK-BE-NEXT:  .LBB27_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[2], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    bpl .LBB27_4
-; CHECK-BE-NEXT:  .LBB27_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[3], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    bpl .LBB27_5
-; CHECK-BE-NEXT:  .LBB27_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[4], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    bpl .LBB27_6
-; CHECK-BE-NEXT:  .LBB27_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[5], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    bpl .LBB27_7
-; CHECK-BE-NEXT:  .LBB27_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[6], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    bmi .LBB27_8
-; CHECK-BE-NEXT:    b .LBB27_9
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x half>*
@@ -3231,164 +728,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    bne .LBB28_12
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    bmi .LBB28_13
-; CHECK-LE-NEXT:  .LBB28_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    bmi .LBB28_14
-; CHECK-LE-NEXT:  .LBB28_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    bmi .LBB28_15
-; CHECK-LE-NEXT:  .LBB28_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    bmi .LBB28_16
-; CHECK-LE-NEXT:  .LBB28_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    bpl .LBB28_7
-; CHECK-LE-NEXT:  .LBB28_6: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:  .LBB28_7: @ %else14
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    lsls r2, r3, #25
-; CHECK-LE-NEXT:    bpl .LBB28_9
-; CHECK-LE-NEXT:  @ %bb.8: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:  .LBB28_9: @ %else17
-; CHECK-LE-NEXT:    lsls r2, r3, #24
-; CHECK-LE-NEXT:    bpl .LBB28_11
-; CHECK-LE-NEXT:  @ %bb.10: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:  .LBB28_11: @ %else20
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB28_12: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    bpl .LBB28_2
-; CHECK-LE-NEXT:  .LBB28_13: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    bpl .LBB28_3
-; CHECK-LE-NEXT:  .LBB28_14: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    bpl .LBB28_4
-; CHECK-LE-NEXT:  .LBB28_15: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    bpl .LBB28_5
-; CHECK-LE-NEXT:  .LBB28_16: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    bmi .LBB28_6
-; CHECK-LE-NEXT:    b .LBB28_7
 ;
 ; CHECK-BE-LABEL: masked_v8f16_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    bne .LBB28_12
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    bmi .LBB28_13
-; CHECK-BE-NEXT:  .LBB28_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    bmi .LBB28_14
-; CHECK-BE-NEXT:  .LBB28_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    bmi .LBB28_15
-; CHECK-BE-NEXT:  .LBB28_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    bmi .LBB28_16
-; CHECK-BE-NEXT:  .LBB28_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    bpl .LBB28_7
-; CHECK-BE-NEXT:  .LBB28_6: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[5], r2
-; CHECK-BE-NEXT:  .LBB28_7: @ %else14
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    lsls r2, r3, #25
-; CHECK-BE-NEXT:    bpl .LBB28_9
-; CHECK-BE-NEXT:  @ %bb.8: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[6], r2
-; CHECK-BE-NEXT:  .LBB28_9: @ %else17
-; CHECK-BE-NEXT:    lsls r2, r3, #24
-; CHECK-BE-NEXT:    bpl .LBB28_11
-; CHECK-BE-NEXT:  @ %bb.10: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s4
-; CHECK-BE-NEXT:    vmov.16 q0[7], r0
-; CHECK-BE-NEXT:  .LBB28_11: @ %else20
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB28_12: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    bpl .LBB28_2
-; CHECK-BE-NEXT:  .LBB28_13: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    bpl .LBB28_3
-; CHECK-BE-NEXT:  .LBB28_14: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    bpl .LBB28_4
-; CHECK-BE-NEXT:  .LBB28_15: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[3], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    bpl .LBB28_5
-; CHECK-BE-NEXT:  .LBB28_16: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[4], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    bmi .LBB28_6
-; CHECK-BE-NEXT:    b .LBB28_7
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <8 x half>*

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll?rev=370329&r1=370328&r2=370329&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll Thu Aug 29 03:54:35 2019
@@ -5,57 +5,17 @@
 define arm_aapcs_vfpcc void @masked_v4i32(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, s0
-; CHECK-LE-NEXT:    strne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s1
-; CHECK-LE-NEXT:    strmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s2
-; CHECK-LE-NEXT:    strmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s3
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r2, s4
-; CHECK-BE-NEXT:    strne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s5
-; CHECK-BE-NEXT:    strmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s6
-; CHECK-BE-NEXT:    strmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s7
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -66,57 +26,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, s0
-; CHECK-LE-NEXT:    strne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s1
-; CHECK-LE-NEXT:    strmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s2
-; CHECK-LE-NEXT:    strmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s3
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    vrev32.8 q0, q1
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r2, s4
-; CHECK-BE-NEXT:    strne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s5
-; CHECK-BE-NEXT:    strmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s6
-; CHECK-BE-NEXT:    strmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s7
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -127,65 +48,25 @@ entry:
 define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r1, s0
-; CHECK-LE-NEXT:    strne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s1
-; CHECK-LE-NEXT:    strmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s2
-; CHECK-LE-NEXT:    strmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s3
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r1, s0
-; CHECK-BE-NEXT:    strne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s1
-; CHECK-BE-NEXT:    strmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s2
-; CHECK-BE-NEXT:    strmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s3
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -200,67 +81,25 @@ entry:
 define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r1, s0
-; CHECK-LE-NEXT:    strne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s1
-; CHECK-LE-NEXT:    strmi r1, [r0, #4]
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r3, s2
-; CHECK-LE-NEXT:    strmi r3, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s3
-; CHECK-LE-NEXT:    strmi r2, [r0, #12]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r1, s0
-; CHECK-BE-NEXT:    strne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s1
-; CHECK-BE-NEXT:    strmi r1, [r0, #4]
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r3, s2
-; CHECK-BE-NEXT:    strmi r3, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s3
-; CHECK-BE-NEXT:    strmi r2, [r0, #12]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -276,89 +115,17 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8i16(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-LE-NEXT:    strhne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r2, q1[0]
-; CHECK-BE-NEXT:    strhne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[1]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[2]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[3]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[4]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[5]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[6]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q1[7]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -369,89 +136,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-LE-NEXT:    strhne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    vrev16.8 q0, q1
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r2, q1[0]
-; CHECK-BE-NEXT:    strhne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[1]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[2]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[3]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[4]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[5]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[6]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q1[7]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -462,97 +158,25 @@ entry:
 define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-LE-NEXT:    strhne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-LE-NEXT:    lsls r1, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[6]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-BE-NEXT:    strhne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-BE-NEXT:    lsls r1, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[6]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -567,99 +191,25 @@ entry:
 define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-LE-NEXT:    strhne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r3, q0[6]
-; CHECK-LE-NEXT:    strhmi r3, [r0, #12]
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[7]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #14]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-BE-NEXT:    strhne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r3, q0[6]
-; CHECK-BE-NEXT:    strhmi r3, [r0, #12]
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[7]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #14]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -675,170 +225,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v16i8(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u8 r2, q0[0]
-; CHECK-LE-NEXT:    strbne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[1]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[2]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[3]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[4]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[5]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[6]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[7]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[8]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[9]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[10]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[11]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[12]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[13]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[14]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u8 r2, q1[0]
-; CHECK-BE-NEXT:    strbne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[1]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[2]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[3]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[4]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[5]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[6]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[7]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[8]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[9]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[10]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[11]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[12]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[13]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[14]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q1[15]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q1, [r0]
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %a, <16 x i8>* %dest, i32 1, <16 x i1> %c)
@@ -848,178 +246,26 @@ entry:
 define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    vldr d1, [r7, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-LE-NEXT:    strbne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-LE-NEXT:    lsls r1, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-LE-NEXT:    lsls r1, r2, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-LE-NEXT:    lsls r1, r2, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-LE-NEXT:    lsls r1, r2, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-LE-NEXT:    lsls r1, r2, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r2, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-LE-NEXT:    lsls r1, r2, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[14]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #14]
-; CHECK-LE-NEXT:    lsls r1, r2, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    vldr d1, [r7, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp]
-; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-BE-NEXT:    strbne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-BE-NEXT:    lsls r1, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-BE-NEXT:    lsls r1, r2, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-BE-NEXT:    lsls r1, r2, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-BE-NEXT:    lsls r1, r2, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-BE-NEXT:    lsls r1, r2, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r2, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-BE-NEXT:    lsls r1, r2, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[14]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #14]
-; CHECK-BE-NEXT:    lsls r1, r2, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    vrev64.8 q2, q0
+; CHECK-BE-NEXT:    vcmp.s8 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
@@ -1033,180 +279,26 @@ entry:
 define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    vldr d1, [r7, #8]
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-LE-NEXT:    strbne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-LE-NEXT:    lsls r1, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-LE-NEXT:    lsls r1, r2, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-LE-NEXT:    lsls r1, r2, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-LE-NEXT:    lsls r1, r2, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-LE-NEXT:    lsls r1, r2, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r2, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r3, q0[14]
-; CHECK-LE-NEXT:    strbmi r3, [r0, #14]
-; CHECK-LE-NEXT:    lsls r2, r2, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[15]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #15]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    vldr d1, [r7, #8]
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp]
-; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-BE-NEXT:    strbne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-BE-NEXT:    lsls r1, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-BE-NEXT:    lsls r1, r2, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-BE-NEXT:    lsls r1, r2, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-BE-NEXT:    lsls r1, r2, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-BE-NEXT:    lsls r1, r2, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r2, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r3, q0[14]
-; CHECK-BE-NEXT:    strbmi r3, [r0, #14]
-; CHECK-BE-NEXT:    lsls r2, r2, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[15]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #15]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    vrev64.8 q2, q0
+; CHECK-BE-NEXT:    vcmp.s8 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
@@ -1221,50 +313,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v4f32(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) {
 ; CHECK-LE-LABEL: masked_v4f32:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vstrne s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s7, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <4 x i32> %b, zeroinitializer
@@ -1275,66 +335,19 @@ entry:
 define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) {
 ; CHECK-LE-LABEL: masked_v4f32_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #20
-; CHECK-LE-NEXT:    sub sp, #20
-; CHECK-LE-NEXT:    add r1, sp, #16
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp, #16]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    ittt ne
-; CHECK-LE-NEXT:    vstrne s0, [sp, #12]
-; CHECK-LE-NEXT:    ldrne r2, [sp, #12]
-; CHECK-LE-NEXT:    strne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    ittt mi
-; CHECK-LE-NEXT:    vstrmi s1, [sp, #8]
-; CHECK-LE-NEXT:    ldrmi r2, [sp, #8]
-; CHECK-LE-NEXT:    strmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    ittt mi
-; CHECK-LE-NEXT:    vstrmi s2, [sp, #4]
-; CHECK-LE-NEXT:    ldrmi r2, [sp, #4]
-; CHECK-LE-NEXT:    strmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    ittt mi
-; CHECK-LE-NEXT:    vstrmi s3, [sp]
-; CHECK-LE-NEXT:    ldrmi r1, [sp]
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #20
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #20
-; CHECK-BE-NEXT:    sub sp, #20
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    add r1, sp, #16
-; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp, #16]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    ittt ne
-; CHECK-BE-NEXT:    vstrne s4, [sp, #12]
-; CHECK-BE-NEXT:    ldrne r2, [sp, #12]
-; CHECK-BE-NEXT:    strne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    ittt mi
-; CHECK-BE-NEXT:    vstrmi s5, [sp, #8]
-; CHECK-BE-NEXT:    ldrmi r2, [sp, #8]
-; CHECK-BE-NEXT:    strmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    ittt mi
-; CHECK-BE-NEXT:    vstrmi s6, [sp, #4]
-; CHECK-BE-NEXT:    ldrmi r2, [sp, #4]
-; CHECK-BE-NEXT:    strmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    ittt mi
-; CHECK-BE-NEXT:    vstrmi s7, [sp]
-; CHECK-BE-NEXT:    ldrmi r1, [sp]
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #20
+; CHECK-BE-NEXT:    vrev32.8 q0, q1
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <4 x i32> %b, zeroinitializer
@@ -1345,57 +358,25 @@ entry:
 define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vstrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1410,59 +391,25 @@ entry:
 define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vstrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1478,146 +425,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8f16(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) {
 ; CHECK-LE-LABEL: masked_v8f16:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    bne .LBB15_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB15_10
-; CHECK-LE-NEXT:  .LBB15_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB15_11
-; CHECK-LE-NEXT:  .LBB15_3: @ %else4
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB15_12
-; CHECK-LE-NEXT:  .LBB15_4: @ %else6
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB15_13
-; CHECK-LE-NEXT:  .LBB15_5: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB15_14
-; CHECK-LE-NEXT:  .LBB15_6: @ %else10
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB15_15
-; CHECK-LE-NEXT:  .LBB15_7: @ %else12
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB15_16
-; CHECK-LE-NEXT:  .LBB15_8: @ %else14
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB15_9: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB15_2
-; CHECK-LE-NEXT:  .LBB15_10: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB15_3
-; CHECK-LE-NEXT:  .LBB15_11: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB15_4
-; CHECK-LE-NEXT:  .LBB15_12: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB15_5
-; CHECK-LE-NEXT:  .LBB15_13: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB15_6
-; CHECK-LE-NEXT:  .LBB15_14: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB15_7
-; CHECK-LE-NEXT:  .LBB15_15: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB15_8
-; CHECK-LE-NEXT:  .LBB15_16: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    bne .LBB15_9
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB15_10
-; CHECK-BE-NEXT:  .LBB15_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB15_11
-; CHECK-BE-NEXT:  .LBB15_3: @ %else4
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB15_12
-; CHECK-BE-NEXT:  .LBB15_4: @ %else6
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB15_13
-; CHECK-BE-NEXT:  .LBB15_5: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB15_14
-; CHECK-BE-NEXT:  .LBB15_6: @ %else10
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB15_15
-; CHECK-BE-NEXT:  .LBB15_7: @ %else12
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB15_16
-; CHECK-BE-NEXT:  .LBB15_8: @ %else14
-; CHECK-BE-NEXT:    add sp, #8
-; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB15_9: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB15_2
-; CHECK-BE-NEXT:  .LBB15_10: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s0, s4
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB15_3
-; CHECK-BE-NEXT:  .LBB15_11: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB15_4
-; CHECK-BE-NEXT:  .LBB15_12: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s0, s5
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB15_5
-; CHECK-BE-NEXT:  .LBB15_13: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB15_6
-; CHECK-BE-NEXT:  .LBB15_14: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s0, s6
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB15_7
-; CHECK-BE-NEXT:  .LBB15_15: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s7, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB15_8
-; CHECK-BE-NEXT:  .LBB15_16: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s7
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <8 x i16> %b, zeroinitializer
@@ -1628,178 +447,19 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) {
 ; CHECK-LE-LABEL: masked_v8f16_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #40
-; CHECK-LE-NEXT:    sub sp, #40
-; CHECK-LE-NEXT:    add r1, sp, #32
 ; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    bne .LBB16_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB16_10
-; CHECK-LE-NEXT:  .LBB16_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB16_11
-; CHECK-LE-NEXT:  .LBB16_3: @ %else4
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB16_12
-; CHECK-LE-NEXT:  .LBB16_4: @ %else6
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB16_13
-; CHECK-LE-NEXT:  .LBB16_5: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB16_14
-; CHECK-LE-NEXT:  .LBB16_6: @ %else10
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB16_15
-; CHECK-LE-NEXT:  .LBB16_7: @ %else12
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB16_16
-; CHECK-LE-NEXT:  .LBB16_8: @ %else14
-; CHECK-LE-NEXT:    add sp, #40
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB16_9: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [sp, #28]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #28]
-; CHECK-LE-NEXT:    strh r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB16_2
-; CHECK-LE-NEXT:  .LBB16_10: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #24]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #24]
-; CHECK-LE-NEXT:    strh r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB16_3
-; CHECK-LE-NEXT:  .LBB16_11: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [sp, #20]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #20]
-; CHECK-LE-NEXT:    strh r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB16_4
-; CHECK-LE-NEXT:  .LBB16_12: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #16]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #16]
-; CHECK-LE-NEXT:    strh r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB16_5
-; CHECK-LE-NEXT:  .LBB16_13: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [sp, #12]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #12]
-; CHECK-LE-NEXT:    strh r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB16_6
-; CHECK-LE-NEXT:  .LBB16_14: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #8]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #8]
-; CHECK-LE-NEXT:    strh r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB16_7
-; CHECK-LE-NEXT:  .LBB16_15: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [sp, #4]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #4]
-; CHECK-LE-NEXT:    strh r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB16_8
-; CHECK-LE-NEXT:  .LBB16_16: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [sp]
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    strh r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #40
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #40
-; CHECK-BE-NEXT:    sub sp, #40
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vrev16.8 q0, q2
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    add r1, sp, #32
 ; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    bne .LBB16_9
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB16_10
-; CHECK-BE-NEXT:  .LBB16_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB16_11
-; CHECK-BE-NEXT:  .LBB16_3: @ %else4
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB16_12
-; CHECK-BE-NEXT:  .LBB16_4: @ %else6
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB16_13
-; CHECK-BE-NEXT:  .LBB16_5: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB16_14
-; CHECK-BE-NEXT:  .LBB16_6: @ %else10
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB16_15
-; CHECK-BE-NEXT:  .LBB16_7: @ %else12
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB16_16
-; CHECK-BE-NEXT:  .LBB16_8: @ %else14
-; CHECK-BE-NEXT:    add sp, #40
-; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB16_9: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s4, [sp, #28]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #28]
-; CHECK-BE-NEXT:    strh r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB16_2
-; CHECK-BE-NEXT:  .LBB16_10: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s0, s4
-; CHECK-BE-NEXT:    vstr.16 s0, [sp, #24]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #24]
-; CHECK-BE-NEXT:    strh r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB16_3
-; CHECK-BE-NEXT:  .LBB16_11: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s5, [sp, #20]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #20]
-; CHECK-BE-NEXT:    strh r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB16_4
-; CHECK-BE-NEXT:  .LBB16_12: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s0, s5
-; CHECK-BE-NEXT:    vstr.16 s0, [sp, #16]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #16]
-; CHECK-BE-NEXT:    strh r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB16_5
-; CHECK-BE-NEXT:  .LBB16_13: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s6, [sp, #12]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #12]
-; CHECK-BE-NEXT:    strh r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB16_6
-; CHECK-BE-NEXT:  .LBB16_14: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s0, s6
-; CHECK-BE-NEXT:    vstr.16 s0, [sp, #8]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #8]
-; CHECK-BE-NEXT:    strh r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB16_7
-; CHECK-BE-NEXT:  .LBB16_15: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s7, [sp, #4]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #4]
-; CHECK-BE-NEXT:    strh r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB16_8
-; CHECK-BE-NEXT:  .LBB16_16: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s7
-; CHECK-BE-NEXT:    vstr.16 s0, [sp]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    strh r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #40
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <8 x i16> %b, zeroinitializer
@@ -1810,153 +470,25 @@ entry:
 define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    bne .LBB17_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB17_10
-; CHECK-LE-NEXT:  .LBB17_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB17_11
-; CHECK-LE-NEXT:  .LBB17_3: @ %else4
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB17_12
-; CHECK-LE-NEXT:  .LBB17_4: @ %else6
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB17_13
-; CHECK-LE-NEXT:  .LBB17_5: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB17_14
-; CHECK-LE-NEXT:  .LBB17_6: @ %else10
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB17_15
-; CHECK-LE-NEXT:  .LBB17_7: @ %else12
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB17_16
-; CHECK-LE-NEXT:  .LBB17_8: @ %else14
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB17_9: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB17_2
-; CHECK-LE-NEXT:  .LBB17_10: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB17_3
-; CHECK-LE-NEXT:  .LBB17_11: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB17_4
-; CHECK-LE-NEXT:  .LBB17_12: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB17_5
-; CHECK-LE-NEXT:  .LBB17_13: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB17_6
-; CHECK-LE-NEXT:  .LBB17_14: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB17_7
-; CHECK-LE-NEXT:  .LBB17_15: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB17_8
-; CHECK-LE-NEXT:  .LBB17_16: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    bne .LBB17_9
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB17_10
-; CHECK-BE-NEXT:  .LBB17_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB17_11
-; CHECK-BE-NEXT:  .LBB17_3: @ %else4
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB17_12
-; CHECK-BE-NEXT:  .LBB17_4: @ %else6
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB17_13
-; CHECK-BE-NEXT:  .LBB17_5: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB17_14
-; CHECK-BE-NEXT:  .LBB17_6: @ %else10
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB17_15
-; CHECK-BE-NEXT:  .LBB17_7: @ %else12
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB17_16
-; CHECK-BE-NEXT:  .LBB17_8: @ %else14
-; CHECK-BE-NEXT:    add sp, #8
-; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB17_9: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB17_2
-; CHECK-BE-NEXT:  .LBB17_10: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB17_3
-; CHECK-BE-NEXT:  .LBB17_11: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB17_4
-; CHECK-BE-NEXT:  .LBB17_12: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB17_5
-; CHECK-BE-NEXT:  .LBB17_13: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB17_6
-; CHECK-BE-NEXT:  .LBB17_14: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s4, s2
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB17_7
-; CHECK-BE-NEXT:  .LBB17_15: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB17_8
-; CHECK-BE-NEXT:  .LBB17_16: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s3
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1971,146 +503,26 @@ entry:
 define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    bne .LBB18_12
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    bmi .LBB18_13
-; CHECK-LE-NEXT:  .LBB18_2: @ %else2
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    bmi .LBB18_14
-; CHECK-LE-NEXT:  .LBB18_3: @ %else4
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    bmi .LBB18_15
-; CHECK-LE-NEXT:  .LBB18_4: @ %else6
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    bmi .LBB18_16
-; CHECK-LE-NEXT:  .LBB18_5: @ %else8
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    bpl .LBB18_7
-; CHECK-LE-NEXT:  .LBB18_6: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:  .LBB18_7: @ %else10
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    bpl .LBB18_9
-; CHECK-LE-NEXT:  @ %bb.8: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-LE-NEXT:  .LBB18_9: @ %else12
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    bpl .LBB18_11
-; CHECK-LE-NEXT:  @ %bb.10: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:  .LBB18_11: @ %else14
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB18_12: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    bpl .LBB18_2
-; CHECK-LE-NEXT:  .LBB18_13: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    bpl .LBB18_3
-; CHECK-LE-NEXT:  .LBB18_14: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    bpl .LBB18_4
-; CHECK-LE-NEXT:  .LBB18_15: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    bpl .LBB18_5
-; CHECK-LE-NEXT:  .LBB18_16: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    bmi .LBB18_6
-; CHECK-LE-NEXT:    b .LBB18_7
 ;
 ; CHECK-BE-LABEL: masked_v8f16_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    bne .LBB18_12
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    bmi .LBB18_13
-; CHECK-BE-NEXT:  .LBB18_2: @ %else2
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    bmi .LBB18_14
-; CHECK-BE-NEXT:  .LBB18_3: @ %else4
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    bmi .LBB18_15
-; CHECK-BE-NEXT:  .LBB18_4: @ %else6
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    bmi .LBB18_16
-; CHECK-BE-NEXT:  .LBB18_5: @ %else8
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    bpl .LBB18_7
-; CHECK-BE-NEXT:  .LBB18_6: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s4, s2
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:  .LBB18_7: @ %else10
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    bpl .LBB18_9
-; CHECK-BE-NEXT:  @ %bb.8: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-BE-NEXT:  .LBB18_9: @ %else12
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    bpl .LBB18_11
-; CHECK-BE-NEXT:  @ %bb.10: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s3
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:  .LBB18_11: @ %else14
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB18_12: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    bpl .LBB18_2
-; CHECK-BE-NEXT:  .LBB18_13: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    bpl .LBB18_3
-; CHECK-BE-NEXT:  .LBB18_14: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    bpl .LBB18_4
-; CHECK-BE-NEXT:  .LBB18_15: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    bpl .LBB18_5
-; CHECK-BE-NEXT:  .LBB18_16: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    bmi .LBB18_6
-; CHECK-BE-NEXT:    b .LBB18_7
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x half>*




More information about the llvm-commits mailing list