[llvm] r371932 - [ARM] Masked loads and stores

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Sep 15 07:14:48 PDT 2019


Author: dmgreen
Date: Sun Sep 15 07:14:47 2019
New Revision: 371932

URL: http://llvm.org/viewvc/llvm-project?rev=371932&view=rev
Log:
[ARM] Masked loads and stores

Masked loads and stores fit naturally with MVE, the instructions being easily
predicated. This adds lowering for the simple cases of masked loads and stores.
It does not yet deal with widening/narrowing or pre/post inc, and so is
currently behind an option (-enable-arm-maskedldst, off by default).

The llvm masked load intrinsic will accept a "passthru" value, dictating the
values used for the zero masked lanes. In MVE the instructions write 0 to the
zero predicated lanes, so we need to match a passthru that isn't 0 (or undef)
with a select instruction to pull in the correct data after the load.

Differential Revision: https://reviews.llvm.org/D67186

Added:
    llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
    llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
    llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=371932&r1=371931&r2=371932&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Sun Sep 15 07:14:47 2019
@@ -259,6 +259,8 @@ void ARMTargetLowering::addMVEVectorType
     setOperationAction(ISD::UMAX, VT, Legal);
     setOperationAction(ISD::ABS, VT, Legal);
     setOperationAction(ISD::SETCC, VT, Custom);
+    setOperationAction(ISD::MLOAD, VT, Custom);
+    setOperationAction(ISD::MSTORE, VT, Legal);
 
     // No native support for these.
     setOperationAction(ISD::UDIV, VT, Expand);
@@ -304,6 +306,8 @@ void ARMTargetLowering::addMVEVectorType
     setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
     setOperationAction(ISD::SETCC, VT, Custom);
+    setOperationAction(ISD::MLOAD, VT, Custom);
+    setOperationAction(ISD::MSTORE, VT, Legal);
 
     // Pre and Post inc are supported on loads and stores
     for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -8848,6 +8852,31 @@ static SDValue LowerPredicateStore(SDVal
       ST->getMemOperand());
 }
 
+static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
+  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+  MVT VT = Op.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDValue PassThru = N->getPassThru();
+  SDLoc dl(Op);
+
+  if (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
+      (PassThru->getOpcode() == ARMISD::VMOVIMM &&
+       isNullConstant(PassThru->getOperand(0))))
+    return Op;
+
+  // MVE Masked loads use zero as the passthru value. Here we convert undef to
+  // zero too, and other values are lowered to a select.
+  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                                DAG.getTargetConstant(0, dl, MVT::i32));
+  SDValue NewLoad = DAG.getMaskedLoad(
+      VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
+      N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
+  SDValue Combo = NewLoad;
+  if (!PassThru.isUndef())
+    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
+  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
     // Acquire/Release load/store is not legal for targets without a dmb or
@@ -9051,6 +9080,8 @@ SDValue ARMTargetLowering::LowerOperatio
     return LowerPredicateLoad(Op, DAG);
   case ISD::STORE:
     return LowerPredicateStore(Op, DAG);
+  case ISD::MLOAD:
+    return LowerMLOAD(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);

Modified: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrMVE.td?rev=371932&r1=371931&r2=371932&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td Sun Sep 15 07:14:47 2019
@@ -4892,6 +4892,10 @@ class MVE_vector_store_typed<ValueType T
                              PatFrag StoreKind, int shift>
   : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
         (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
+                                   PatFrag StoreKind, int shift>
+  : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
+        (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>;
 
 multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
                             int shift> {
@@ -4908,6 +4912,10 @@ class MVE_vector_load_typed<ValueType Ty
                             PatFrag LoadKind, int shift>
   : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
         (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
+                                  PatFrag LoadKind, int shift>
+  : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
+        (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>;
 
 multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
                            int shift> {
@@ -4953,6 +4961,28 @@ def aligned16_post_store : PatFrag<(ops
   return cast<StoreSDNode>(N)->getAlignment() >= 2;
 }]>;
 
+def alignedmaskedload32 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                                  (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def alignedmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                                  (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+def maskedload : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+                         (masked_ld node:$ptr, node:$pred, node:$passthru)>;
+
+def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                   (masked_st node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def alignedmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                                   (masked_st node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+                          (masked_st node:$val, node:$ptr, node:$pred)>;
+
 let Predicates = [HasMVEInt, IsLE] in {
   // Stores
   defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
@@ -4971,6 +5001,26 @@ let Predicates = [HasMVEInt, IsLE] in {
   defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
   defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
   defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;
+
+  // Unaligned masked stores (aligned are below)
+  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+
+  // Unaligned masked loads
+  def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
+            (v4f32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+  def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
+            (v8f16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
 }
 
 let Predicates = [HasMVEInt, IsBE] in {
@@ -5025,8 +5075,41 @@ let Predicates = [HasMVEInt, IsBE] in {
   def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
   def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
   def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
+
+  // Unaligned masked stores (aligned are below)
+  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+  // Unaligned masked loads
+  def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
+            (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
+  def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
+            (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
+  def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
+            (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
+  def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
+            (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
 }
 
+let Predicates = [HasMVEInt] in {
+  // Aligned masked store, shared between LE and BE
+  def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore, 0>;
+  def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
+  def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
+  def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
+  def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
+  // Aligned masked loads
+  def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload, 0>;
+  def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+  def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+  def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+  def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+}
 
 // Widening/Narrowing Loads/Stores
 

Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp?rev=371932&r1=371931&r2=371932&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp Sun Sep 15 07:14:47 2019
@@ -36,6 +36,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "armtti"
 
+static cl::opt<bool> EnableMaskedLoadStores(
+  "enable-arm-maskedldst", cl::Hidden, cl::init(false),
+  cl::desc("Enable the generation of masked loads and stores"));
+
 static cl::opt<bool> DisableLowOverheadLoops(
   "disable-arm-loloops", cl::Hidden, cl::init(false),
   cl::desc("Disable the generation of low-overhead loops"));
@@ -487,6 +491,22 @@ int ARMTTIImpl::getAddressComputationCos
   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }
 
+bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy) {
+  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
+    return false;
+
+  if (DataTy->isVectorTy()) {
+    // We don't yet support narrowing or widening masked loads/stores. Expand
+    // them for the moment.
+    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+    if (VecWidth != 128)
+      return false;
+  }
+
+  unsigned EltWidth = DataTy->getScalarSizeInBits();
+  return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
+}
+
 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
   const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
   assert(MI && "MemcpyInst expected");

Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h?rev=371932&r1=371931&r2=371932&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h Sun Sep 15 07:14:47 2019
@@ -152,6 +152,9 @@ public:
     return ST->getMaxInterleaveFactor();
   }
 
+  bool isLegalMaskedLoad(Type *DataTy);
+  bool isLegalMaskedStore(Type *DataTy) { return isLegalMaskedLoad(DataTy); }
+
   int getMemcpyCost(const Instruction *I);
 
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll?rev=371932&r1=371931&r2=371932&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll Sun Sep 15 07:14:47 2019
@@ -1,78 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
-; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
 define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
 ; CHECK-LABEL: foo_v4i32_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r1, r12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #0, #1
-; CHECK-NEXT:    ubfx r1, r12, #4, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r12, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r12, #12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #3, #1
-; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrne r3, [r2]
-; CHECK-NEXT:    vmovne.32 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r1, [r2, #12]
-; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -85,8 +22,8 @@ entry:
 define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
 ; CHECK-LABEL: foo_sext_v4i32_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
@@ -121,40 +58,11 @@ define void @foo_sext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #3]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -168,8 +76,8 @@ entry:
 define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
 ; CHECK-LABEL: foo_sext_v4i32_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
@@ -204,39 +112,10 @@ define void @foo_sext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #6]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -250,8 +129,8 @@ entry:
 define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
 ; CHECK-LABEL: foo_zext_v4i32_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
@@ -287,39 +166,10 @@ define void @foo_zext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #3]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -333,8 +183,8 @@ entry:
 define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
 ; CHECK-LABEL: foo_zext_v4i32_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
@@ -369,39 +219,10 @@ define void @foo_zext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #6]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -415,130 +236,11 @@ entry:
 define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
 ; CHECK-LABEL: foo_v8i16_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r1, r12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #0, #1
-; CHECK-NEXT:    ubfx r1, r12, #2, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r12, #4, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r12, #6, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #3, #1
-; CHECK-NEXT:    ubfx r1, r12, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #4, #1
-; CHECK-NEXT:    ubfx r1, r12, #10, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #5, #1
-; CHECK-NEXT:    ubfx r1, r12, #12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #6, #1
-; CHECK-NEXT:    ubfx r1, r12, #14, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r3
-; CHECK-NEXT:    lsls r3, r3, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrhne r3, [r2]
-; CHECK-NEXT:    vmovne.16 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #2]
-; CHECK-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #6]
-; CHECK-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #10]
-; CHECK-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #12]
-; CHECK-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r1, [r2, #14]
-; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    and r3, r1, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #2, #1
-; CHECK-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #3, #1
-; CHECK-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #4, #1
-; CHECK-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #5, #1
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #6, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r2, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r2
-; CHECK-NEXT:    lsls r2, r2, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-NEXT:    strhne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-NEXT:    strhmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-NEXT:    strhmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-NEXT:    strhmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-NEXT:    strhmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-NEXT:    strhmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-NEXT:    strhmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-NEXT:    strhmi r1, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrht.u16 q0, [r2]
+; CHECK-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -551,8 +253,8 @@ entry:
 define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
 ; CHECK-LABEL: foo_sext_v8i16_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
@@ -615,67 +317,10 @@ define void @foo_sext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #7]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmrs r1, p0
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    and r3, r1, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #2, #1
-; CHECK-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #3, #1
-; CHECK-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #4, #1
-; CHECK-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #5, #1
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #6, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r2, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r2
-; CHECK-NEXT:    lsls r2, r2, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-NEXT:    strhne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-NEXT:    strhmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-NEXT:    strhmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-NEXT:    strhmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-NEXT:    strhmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-NEXT:    strhmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-NEXT:    strhmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-NEXT:    strhmi r1, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -689,8 +334,8 @@ entry:
 define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
 ; CHECK-LABEL: foo_zext_v8i16_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
@@ -753,67 +398,10 @@ define void @foo_zext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #7]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmrs r1, p0
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    and r3, r1, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #2, #1
-; CHECK-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #3, #1
-; CHECK-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #4, #1
-; CHECK-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #5, #1
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #6, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r2, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r2
-; CHECK-NEXT:    lsls r2, r2, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-NEXT:    strhne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-NEXT:    strhmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-NEXT:    strhmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-NEXT:    strhmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-NEXT:    strhmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-NEXT:    strhmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-NEXT:    strhmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-NEXT:    strhmi r1, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -827,153 +415,12 @@ entry:
 define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src) {
 ; CHECK-LABEL: foo_v16i8_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r6, r7, lr}
-; CHECK-NEXT:    .setfp r7, sp, #8
-; CHECK-NEXT:    add r7, sp, #8
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    mov r4, sp
-; CHECK-NEXT:    bfc r4, #0, #4
-; CHECK-NEXT:    mov sp, r4
 ; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    sub.w r4, r7, #8
 ; CHECK-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r3, p0
-; CHECK-NEXT:    uxth r1, r3
-; CHECK-NEXT:    lsls r3, r3, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrbne r3, [r2]
-; CHECK-NEXT:    vmovne.8 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #1]
-; CHECK-NEXT:    vmovmi.8 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #2]
-; CHECK-NEXT:    vmovmi.8 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #3]
-; CHECK-NEXT:    vmovmi.8 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.8 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #5]
-; CHECK-NEXT:    vmovmi.8 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #6]
-; CHECK-NEXT:    vmovmi.8 q0[6], r3
-; CHECK-NEXT:    lsls r3, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #7]
-; CHECK-NEXT:    vmovmi.8 q0[7], r3
-; CHECK-NEXT:    lsls r3, r1, #23
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.8 q0[8], r3
-; CHECK-NEXT:    lsls r3, r1, #22
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #9]
-; CHECK-NEXT:    vmovmi.8 q0[9], r3
-; CHECK-NEXT:    lsls r3, r1, #21
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #10]
-; CHECK-NEXT:    vmovmi.8 q0[10], r3
-; CHECK-NEXT:    lsls r3, r1, #20
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #11]
-; CHECK-NEXT:    vmovmi.8 q0[11], r3
-; CHECK-NEXT:    lsls r3, r1, #19
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #12]
-; CHECK-NEXT:    vmovmi.8 q0[12], r3
-; CHECK-NEXT:    lsls r3, r1, #18
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #13]
-; CHECK-NEXT:    vmovmi.8 q0[13], r3
-; CHECK-NEXT:    lsls r3, r1, #17
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r3, [r2, #14]
-; CHECK-NEXT:    vmovmi.8 q0[14], r3
-; CHECK-NEXT:    lsls r1, r1, #16
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrbmi r1, [r2, #15]
-; CHECK-NEXT:    vmovmi.8 q0[15], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    uxth r1, r2
-; CHECK-NEXT:    lsls r2, r2, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne.u8 r2, q0[0]
-; CHECK-NEXT:    strbne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[1]
-; CHECK-NEXT:    strbmi r2, [r0, #1]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[2]
-; CHECK-NEXT:    strbmi r2, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[3]
-; CHECK-NEXT:    strbmi r2, [r0, #3]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[4]
-; CHECK-NEXT:    strbmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[5]
-; CHECK-NEXT:    strbmi r2, [r0, #5]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[6]
-; CHECK-NEXT:    strbmi r2, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[7]
-; CHECK-NEXT:    strbmi r2, [r0, #7]
-; CHECK-NEXT:    lsls r2, r1, #23
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[8]
-; CHECK-NEXT:    strbmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #22
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[9]
-; CHECK-NEXT:    strbmi r2, [r0, #9]
-; CHECK-NEXT:    lsls r2, r1, #21
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[10]
-; CHECK-NEXT:    strbmi r2, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #20
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[11]
-; CHECK-NEXT:    strbmi r2, [r0, #11]
-; CHECK-NEXT:    lsls r2, r1, #19
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[12]
-; CHECK-NEXT:    strbmi r2, [r0, #12]
-; CHECK-NEXT:    lsls r2, r1, #18
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[13]
-; CHECK-NEXT:    strbmi r2, [r0, #13]
-; CHECK-NEXT:    lsls r2, r1, #17
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r2, q0[14]
-; CHECK-NEXT:    strbmi r2, [r0, #14]
-; CHECK-NEXT:    lsls r1, r1, #16
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-NEXT:    strbmi r1, [r0, #15]
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrbt.u8 q0, [r2]
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
 entry:
   %0 = load <16 x i8>, <16 x i8>* %mask, align 1
   %1 = icmp sgt <16 x i8> %0, zeroinitializer
@@ -985,98 +432,40 @@ entry:
 define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
 ; CHECK-LABEL: foo_trunc_v8i8_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r1, r12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #0, #1
-; CHECK-NEXT:    ubfx r1, r12, #2, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r12, #4, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r12, #6, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #3, #1
-; CHECK-NEXT:    ubfx r1, r12, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #4, #1
-; CHECK-NEXT:    ubfx r1, r12, #10, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #5, #1
-; CHECK-NEXT:    ubfx r1, r12, #12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #6, #1
-; CHECK-NEXT:    ubfx r1, r12, #14, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r3
-; CHECK-NEXT:    lsls r3, r3, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrhne r3, [r2]
-; CHECK-NEXT:    vmovne.16 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #2]
-; CHECK-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #6]
-; CHECK-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #10]
-; CHECK-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r3, [r2, #12]
-; CHECK-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrhmi r1, [r2, #14]
-; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r2]
 ; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    and r3, r1, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #2, #1
-; CHECK-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #3, #1
-; CHECK-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #4, #1
-; CHECK-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #5, #1
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    and r2, r1, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #0, #1
+; CHECK-NEXT:    ubfx r2, r1, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #1, #1
+; CHECK-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #2, #1
+; CHECK-NEXT:    ubfx r2, r1, #6, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #3, #1
+; CHECK-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #4, #1
+; CHECK-NEXT:    ubfx r2, r1, #10, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #5, #1
+; CHECK-NEXT:    ubfx r2, r1, #12, #1
 ; CHECK-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #6, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #6, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r2, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r2
-; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    bfi r3, r1, #7, #1
+; CHECK-NEXT:    lsls r2, r3, #31
+; CHECK-NEXT:    uxtb r1, r3
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne.u16 r2, q0[0]
 ; CHECK-NEXT:    strbne r2, [r0]
@@ -1108,7 +497,7 @@ define void @foo_trunc_v8i8_v8i16(<8 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
 ; CHECK-NEXT:    strbmi r1, [r0, #7]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
@@ -1122,57 +511,27 @@ entry:
 define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
 ; CHECK-LABEL: foo_trunc_v4i8_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r1, r12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #0, #1
-; CHECK-NEXT:    ubfx r1, r12, #4, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r12, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
+; CHECK-NEXT:    vmrs r1, p0
+; CHECK-NEXT:    and r2, r1, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #0, #1
+; CHECK-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #1, #1
+; CHECK-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #2, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    bfi r3, r1, #3, #1
 ; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrne r3, [r2]
-; CHECK-NEXT:    vmovne.32 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r1, [r2, #12]
-; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -1189,7 +548,7 @@ define void @foo_trunc_v4i8_v4i32(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    vmovmi r1, s3
 ; CHECK-NEXT:    strbmi r1, [r0, #3]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -1203,57 +562,27 @@ entry:
 define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
 ; CHECK-LABEL: foo_trunc_v4i16_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r1, r12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #0, #1
-; CHECK-NEXT:    ubfx r1, r12, #4, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r12, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
+; CHECK-NEXT:    vmrs r1, p0
+; CHECK-NEXT:    and r2, r1, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #0, #1
+; CHECK-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #1, #1
+; CHECK-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r3, r2, #2, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    bfi r3, r1, #3, #1
 ; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrne r3, [r2]
-; CHECK-NEXT:    vmovne.32 q0[0], r3
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #4]
-; CHECK-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r3, [r2, #8]
-; CHECK-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    ldrmi r1, [r2, #12]
-; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -1270,7 +599,7 @@ define void @foo_trunc_v4i16_v4i32(<4 x
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    vmovmi r1, s3
 ; CHECK-NEXT:    strhmi r1, [r0, #6]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -1284,66 +613,11 @@ entry:
 define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *%src) {
 ; CHECK-LABEL: foo_v4f32_v4f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r1, r12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #0, #1
-; CHECK-NEXT:    ubfx r1, r12, #4, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r12, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r12, #12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #3, #1
-; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    vldrne s0, [r2]
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vldrmi s1, [r2, #4]
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vldrmi s2, [r2, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vldrmi s3, [r2, #12]
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    and r1, r1, #15
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    vstrne s0, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q0, [r2]
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
@@ -1356,195 +630,11 @@ entry:
 define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%src) {
 ; CHECK-LABEL: foo_v8f16_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r1, r12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #0, #1
-; CHECK-NEXT:    ubfx r1, r12, #2, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r12, #4, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r12, #6, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #3, #1
-; CHECK-NEXT:    ubfx r1, r12, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #4, #1
-; CHECK-NEXT:    ubfx r1, r12, #10, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #5, #1
-; CHECK-NEXT:    ubfx r1, r12, #12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #6, #1
-; CHECK-NEXT:    ubfx r1, r12, #14, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r3, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r3
-; CHECK-NEXT:    lsls r3, r3, #31
-; CHECK-NEXT:    bne .LBB13_18
-; CHECK-NEXT:  @ %bb.1: @ %else
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bmi .LBB13_19
-; CHECK-NEXT:  .LBB13_2: @ %else2
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bmi .LBB13_20
-; CHECK-NEXT:  .LBB13_3: @ %else5
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    bmi .LBB13_21
-; CHECK-NEXT:  .LBB13_4: @ %else8
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    bmi .LBB13_22
-; CHECK-NEXT:  .LBB13_5: @ %else11
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    bmi .LBB13_23
-; CHECK-NEXT:  .LBB13_6: @ %else14
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    bmi .LBB13_24
-; CHECK-NEXT:  .LBB13_7: @ %else17
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bpl .LBB13_9
-; CHECK-NEXT:  .LBB13_8: @ %cond.load19
-; CHECK-NEXT:    vldr.16 s4, [r2, #14]
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.16 q0[7], r1
-; CHECK-NEXT:  .LBB13_9: @ %else20
-; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    and r3, r1, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #2, #1
-; CHECK-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #3, #1
-; CHECK-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #4, #1
-; CHECK-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #5, #1
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r2, r3, #6, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi r2, r1, #7, #1
-; CHECK-NEXT:    uxtb r1, r2
-; CHECK-NEXT:    lsls r2, r2, #31
-; CHECK-NEXT:    bne .LBB13_25
-; CHECK-NEXT:  @ %bb.10: @ %else23
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    bmi .LBB13_26
-; CHECK-NEXT:  .LBB13_11: @ %else25
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    bmi .LBB13_27
-; CHECK-NEXT:  .LBB13_12: @ %else27
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    bmi .LBB13_28
-; CHECK-NEXT:  .LBB13_13: @ %else29
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    bmi .LBB13_29
-; CHECK-NEXT:  .LBB13_14: @ %else31
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    bmi .LBB13_30
-; CHECK-NEXT:  .LBB13_15: @ %else33
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    bmi .LBB13_31
-; CHECK-NEXT:  .LBB13_16: @ %else35
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bmi .LBB13_32
-; CHECK-NEXT:  .LBB13_17: @ %else37
-; CHECK-NEXT:    add sp, #16
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB13_18: @ %cond.load
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bpl .LBB13_2
-; CHECK-NEXT:  .LBB13_19: @ %cond.load1
-; CHECK-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[1], r3
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bpl .LBB13_3
-; CHECK-NEXT:  .LBB13_20: @ %cond.load4
-; CHECK-NEXT:    vldr.16 s4, [r2, #4]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[2], r3
-; CHECK-NEXT:    lsls r3, r1, #28
-; CHECK-NEXT:    bpl .LBB13_4
-; CHECK-NEXT:  .LBB13_21: @ %cond.load7
-; CHECK-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[3], r3
-; CHECK-NEXT:    lsls r3, r1, #27
-; CHECK-NEXT:    bpl .LBB13_5
-; CHECK-NEXT:  .LBB13_22: @ %cond.load10
-; CHECK-NEXT:    vldr.16 s4, [r2, #8]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[4], r3
-; CHECK-NEXT:    lsls r3, r1, #26
-; CHECK-NEXT:    bpl .LBB13_6
-; CHECK-NEXT:  .LBB13_23: @ %cond.load13
-; CHECK-NEXT:    vldr.16 s4, [r2, #10]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[5], r3
-; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    bpl.w .LBB13_7
-; CHECK-NEXT:  .LBB13_24: @ %cond.load16
-; CHECK-NEXT:    vldr.16 s4, [r2, #12]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bmi.w .LBB13_8
-; CHECK-NEXT:    b .LBB13_9
-; CHECK-NEXT:  .LBB13_25: @ %cond.store
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    bpl .LBB13_11
-; CHECK-NEXT:  .LBB13_26: @ %cond.store24
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    bpl .LBB13_12
-; CHECK-NEXT:  .LBB13_27: @ %cond.store26
-; CHECK-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #28
-; CHECK-NEXT:    bpl .LBB13_13
-; CHECK-NEXT:  .LBB13_28: @ %cond.store28
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-NEXT:    lsls r2, r1, #27
-; CHECK-NEXT:    bpl .LBB13_14
-; CHECK-NEXT:  .LBB13_29: @ %cond.store30
-; CHECK-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-NEXT:    lsls r2, r1, #26
-; CHECK-NEXT:    bpl .LBB13_15
-; CHECK-NEXT:  .LBB13_30: @ %cond.store32
-; CHECK-NEXT:    vmovx.f16 s4, s2
-; CHECK-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-NEXT:    lsls r2, r1, #25
-; CHECK-NEXT:    bpl .LBB13_16
-; CHECK-NEXT:  .LBB13_31: @ %cond.store34
-; CHECK-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bpl .LBB13_17
-; CHECK-NEXT:  .LBB13_32: @ %cond.store36
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrht.u16 q0, [r2]
+; CHECK-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll?rev=371932&r1=371931&r2=371932&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll Sun Sep 15 07:14:47 2019
@@ -1,100 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
-; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_zero(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    mov.w r12, #0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #3, #1
-; CHECK-LE-NEXT:    and r1, r2, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    beq .LBB0_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    ldr r2, [r0]
-; CHECK-LE-NEXT:    vdup.32 q0, r12
-; CHECK-LE-NEXT:    vmov.32 q0[0], r2
-; CHECK-LE-NEXT:    b .LBB0_3
-; CHECK-LE-NEXT:  .LBB0_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB0_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    mov.w r12, #0
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #3, #1
-; CHECK-BE-NEXT:    and r1, r2, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    beq .LBB0_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    ldr r2, [r0]
-; CHECK-BE-NEXT:    vdup.32 q1, r12
-; CHECK-BE-NEXT:    vmov.32 q1[0], r2
-; CHECK-BE-NEXT:    b .LBB0_3
-; CHECK-BE-NEXT:  .LBB0_2:
-; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-BE-NEXT:  .LBB0_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -105,84 +27,18 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_undef(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -193,84 +49,19 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev32.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -281,82 +72,20 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_other(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q1, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q1[2], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -367,87 +96,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r2, r2, #15
-; CHECK-LE-NEXT:    lsls r3, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r3, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r3, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r3, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r2
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r2, r2, #15
-; CHECK-BE-NEXT:    lsls r3, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r3, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q0[0], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r3, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r3, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q0[2], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q0[3], r2
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -462,89 +125,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r3, r2, #15
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.32 q0[3], r0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r3, r2, #15
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.32 q0[0], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.32 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.32 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.32 q0[3], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -561,153 +156,23 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_zero(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov.w r12, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r2, r3, #0
-; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-LE-NEXT:    ubfx r2, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-LE-NEXT:    ubfx r2, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-LE-NEXT:    ubfx r2, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-LE-NEXT:    ubfx r2, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-LE-NEXT:    ubfx r2, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-LE-NEXT:    ubfx r2, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r3, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r3
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    beq .LBB6_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    ldrh r2, [r0]
-; CHECK-LE-NEXT:    vdup.16 q0, r12
-; CHECK-LE-NEXT:    vmov.16 q0[0], r2
-; CHECK-LE-NEXT:    b .LBB6_3
-; CHECK-LE-NEXT:  .LBB6_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB6_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov.w r12, #0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r2, r3, #0
-; CHECK-BE-NEXT:    movs r3, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r3, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r3
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    beq .LBB6_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    ldrh r2, [r0]
-; CHECK-BE-NEXT:    vdup.16 q1, r12
-; CHECK-BE-NEXT:    vmov.16 q1[0], r2
-; CHECK-BE-NEXT:    b .LBB6_3
-; CHECK-BE-NEXT:  .LBB6_2:
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vrev32.16 q1, q0
-; CHECK-BE-NEXT:  .LBB6_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vrev32.16 q1, q1
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -718,140 +183,18 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_undef(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -862,140 +205,19 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev16.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -1006,138 +228,20 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_other(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q1, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q1[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -1148,147 +252,25 @@ entry:
 define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r3, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r2, r12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-LE-NEXT:    uxtb r2, r3
-; CHECK-LE-NEXT:    lsls r3, r3, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r3, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r3, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r2
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #8
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r3, #0
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r2, r12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-BE-NEXT:    uxtb r2, r3
-; CHECK-BE-NEXT:    lsls r3, r3, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r3, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q0[0], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q0[2], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q0[3], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q0[4], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q0[5], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r3, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q0[6], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q0[7], r2
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -1303,145 +285,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-LE-NEXT:    uxtb r3, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrhne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.16 q0[7], r0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #8
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-BE-NEXT:    uxtb r3, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrhne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.16 q0[0], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.16 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.16 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.16 q0[3], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.16 q0[4], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.16 q0[5], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.16 q0[6], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrhmi r0, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.16 q0[7], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -1457,184 +315,24 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_zero(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    uxth r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    beq .LBB12_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    ldrb r3, [r0]
-; CHECK-LE-NEXT:    vdup.8 q0, r2
-; CHECK-LE-NEXT:    vmov.8 q0[0], r3
-; CHECK-LE-NEXT:    b .LBB12_3
-; CHECK-LE-NEXT:  .LBB12_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB12_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    uxth r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    beq .LBB12_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    ldrb r3, [r0]
-; CHECK-BE-NEXT:    vdup.8 q1, r2
-; CHECK-BE-NEXT:    vmov.8 q1[0], r3
-; CHECK-BE-NEXT:    b .LBB12_3
-; CHECK-BE-NEXT:  .LBB12_2:
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vrev32.8 q1, q0
-; CHECK-BE-NEXT:  .LBB12_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q1[6], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q1[7], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q1[8], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q1[9], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q1[10], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q1[11], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q1[12], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q1[13], r2
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q1[14], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q1[15], r0
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.8 q2, q0
+; CHECK-BE-NEXT:    vcmp.s8 gt, q2, zr
+; CHECK-BE-NEXT:    vrev32.8 q1, q1
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> zeroinitializer)
@@ -1644,171 +342,19 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_undef(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    uxth r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    uxth r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q1[6], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q1[7], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q1[8], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q1[9], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q1[10], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q1[11], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q1[12], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q1[13], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q1[14], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q1[15], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> undef)
@@ -1818,169 +364,21 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_other(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    uxth r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q1, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    uxth r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q1[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q1[6], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q1[7], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q1[8], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q1[9], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q1[10], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q1[11], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q1[12], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q1[13], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q1[14], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q1[15], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   %l = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %dest, i32 1, <16 x i1> %c, <16 x i8> %a)
@@ -1990,174 +388,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0, #4]
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    vmrs r3, p0
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    uxth r2, r3
-; CHECK-LE-NEXT:    lsls r3, r3, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r3, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r3, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r2
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vmrs r3, p0
-; CHECK-BE-NEXT:    uxth r2, r3
-; CHECK-BE-NEXT:    lsls r3, r3, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r3, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q0[0], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q0[2], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q0[3], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q0[4], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q0[5], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q0[6], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q0[7], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q0[8], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q0[9], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q0[10], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q0[11], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q0[12], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q0[13], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r3, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q0[14], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q0[15], r2
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vstrb.8 q0, [r1]
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <16 x i8>*
@@ -2171,176 +417,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    uxth r3, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrbne r2, [r0]
-; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-LE-NEXT:    vmovmi.8 q0[15], r0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    uxth r3, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrbne r2, [r0]
-; CHECK-BE-NEXT:    vmovne.8 q0[0], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    vmovmi.8 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    vmovmi.8 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    vmovmi.8 q0[3], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi.8 q0[4], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    vmovmi.8 q0[5], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    vmovmi.8 q0[6], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    vmovmi.8 q0[7], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi.8 q0[8], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    vmovmi.8 q0[9], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    vmovmi.8 q0[10], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    vmovmi.8 q0[11], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi.8 q0[12], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    vmovmi.8 q0[13], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    vmovmi.8 q0[14], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrbmi r0, [r0, #15]
-; CHECK-BE-NEXT:    vmovmi.8 q0[15], r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vstrb.8 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
@@ -2355,101 +447,23 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_zero(<4 x float> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    beq .LBB17_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    vldr s0, .LCPI17_0
-; CHECK-LE-NEXT:    vldr s4, [r0]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vdup.32 q0, r2
-; CHECK-LE-NEXT:    vmov.f32 s0, s4
-; CHECK-LE-NEXT:    b .LBB17_3
-; CHECK-LE-NEXT:  .LBB17_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:  .LBB17_3: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:    .p2align 2
-; CHECK-LE-NEXT:  @ %bb.4:
-; CHECK-LE-NEXT:  .LCPI17_0:
-; CHECK-LE-NEXT:    .long 0 @ float 0
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    beq .LBB17_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    vldr s0, .LCPI17_0
-; CHECK-BE-NEXT:    vldr s2, [r0]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vdup.32 q1, r2
-; CHECK-BE-NEXT:    vmov.f32 s4, s2
-; CHECK-BE-NEXT:    b .LBB17_3
-; CHECK-BE-NEXT:  .LBB17_2:
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
 ; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-BE-NEXT:  .LBB17_3: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s7, [r0, #12]
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:    .p2align 2
-; CHECK-BE-NEXT:  @ %bb.4:
-; CHECK-BE-NEXT:  .LCPI17_0:
-; CHECK-BE-NEXT:    .long 0 @ float 0
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
   %l = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %dest, i32 4, <4 x i1> %c, <4 x float> zeroinitializer)
@@ -2459,76 +473,18 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_undef(<4 x float> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s7, [r0, #12]
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -2539,84 +495,19 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    ldrne r2, [r0]
-; CHECK-LE-NEXT:    vmovne s0, r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-LE-NEXT:    vmovmi s1, r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-LE-NEXT:    vmovmi s2, r2
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-LE-NEXT:    vmovmi s3, r0
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    ldrne r2, [r0]
-; CHECK-BE-NEXT:    vmovne s4, r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #4]
-; CHECK-BE-NEXT:    vmovmi s5, r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r2, [r0, #8]
-; CHECK-BE-NEXT:    vmovmi s6, r2
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    ldrmi r0, [r0, #12]
-; CHECK-BE-NEXT:    vmovmi s7, r0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev32.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -2627,76 +518,21 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_other(<4 x float> *%dest, <4 x i32> %a, <4 x float> %b) {
 ; CHECK-LE-LABEL: masked_v4f32_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s4, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s5, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s6, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s7, [r0, #12]
-; CHECK-LE-NEXT:    vmov q0, q1
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
-; CHECK-BE-NEXT:    vrev64.32 q2, q0
-; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s8, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s9, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s10, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s11, [r0, #12]
-; CHECK-BE-NEXT:    vrev64.32 q0, q2
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q2
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -2707,79 +543,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r2, r2, #15
-; CHECK-LE-NEXT:    lsls r3, r2, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0, #4]
 ; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r2, r2, #15
-; CHECK-BE-NEXT:    lsls r3, r2, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s3, [r0, #12]
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -2794,81 +572,21 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r3, r2, #15
-; CHECK-LE-NEXT:    lsls r2, r3, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vldrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vldrmi s3, [r0, #12]
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r3, r2, #15
-; CHECK-BE-NEXT:    lsls r2, r3, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vldrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vldrmi s3, [r0, #12]
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
@@ -2884,233 +602,24 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_zero(<8 x half> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    beq .LBB23_2
-; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, .LCPI23_0
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    vmov r3, s0
-; CHECK-LE-NEXT:    vdup.16 q0, r2
-; CHECK-LE-NEXT:    vmov.16 q0[0], r3
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB23_3
-; CHECK-LE-NEXT:    b .LBB23_4
-; CHECK-LE-NEXT:  .LBB23_2:
-; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB23_4
-; CHECK-LE-NEXT:  .LBB23_3: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:  .LBB23_4: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB23_11
-; CHECK-LE-NEXT:  @ %bb.5: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB23_12
-; CHECK-LE-NEXT:  .LBB23_6: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB23_13
-; CHECK-LE-NEXT:  .LBB23_7: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB23_14
-; CHECK-LE-NEXT:  .LBB23_8: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB23_15
-; CHECK-LE-NEXT:  .LBB23_9: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB23_16
-; CHECK-LE-NEXT:  .LBB23_10: @ %else20
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB23_11: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB23_6
-; CHECK-LE-NEXT:  .LBB23_12: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB23_7
-; CHECK-LE-NEXT:  .LBB23_13: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB23_8
-; CHECK-LE-NEXT:  .LBB23_14: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB23_9
-; CHECK-LE-NEXT:  .LBB23_15: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB23_10
-; CHECK-LE-NEXT:  .LBB23_16: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:    .p2align 1
-; CHECK-LE-NEXT:  @ %bb.17:
-; CHECK-LE-NEXT:  .LCPI23_0:
-; CHECK-LE-NEXT:    .short 0 @ half 0
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    beq .LBB23_2
-; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, .LCPI23_0
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    vdup.16 q1, r2
-; CHECK-BE-NEXT:    vmov r3, s0
-; CHECK-BE-NEXT:    vmov.16 q1[0], r3
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB23_3
-; CHECK-BE-NEXT:    b .LBB23_4
-; CHECK-BE-NEXT:  .LBB23_2:
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vrev32.16 q1, q0
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB23_4
-; CHECK-BE-NEXT:  .LBB23_3: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[1], r2
-; CHECK-BE-NEXT:  .LBB23_4: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB23_12
-; CHECK-BE-NEXT:  @ %bb.5: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB23_13
-; CHECK-BE-NEXT:  .LBB23_6: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB23_14
-; CHECK-BE-NEXT:  .LBB23_7: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB23_15
-; CHECK-BE-NEXT:  .LBB23_8: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB23_16
-; CHECK-BE-NEXT:  .LBB23_9: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB23_11
-; CHECK-BE-NEXT:  .LBB23_10: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q1[7], r0
-; CHECK-BE-NEXT:  .LBB23_11: @ %else20
+; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vrev32.16 q1, q1
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB23_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB23_6
-; CHECK-BE-NEXT:  .LBB23_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB23_7
-; CHECK-BE-NEXT:  .LBB23_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB23_8
-; CHECK-BE-NEXT:  .LBB23_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB23_9
-; CHECK-BE-NEXT:  .LBB23_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB23_10
-; CHECK-BE-NEXT:    b .LBB23_11
-; CHECK-BE-NEXT:    .p2align 1
-; CHECK-BE-NEXT:  @ %bb.17:
-; CHECK-BE-NEXT:  .LCPI23_0:
-; CHECK-BE-NEXT:    .short 0 @ half 0
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> zeroinitializer)
@@ -3120,216 +629,19 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_undef(<8 x half> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_align4_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    bne .LBB24_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB24_10
-; CHECK-LE-NEXT:  .LBB24_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB24_11
-; CHECK-LE-NEXT:  .LBB24_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB24_12
-; CHECK-LE-NEXT:  .LBB24_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB24_13
-; CHECK-LE-NEXT:  .LBB24_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB24_14
-; CHECK-LE-NEXT:  .LBB24_6: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB24_15
-; CHECK-LE-NEXT:  .LBB24_7: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB24_16
-; CHECK-LE-NEXT:  .LBB24_8: @ %else20
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB24_9: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB24_2
-; CHECK-LE-NEXT:  .LBB24_10: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB24_3
-; CHECK-LE-NEXT:  .LBB24_11: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB24_4
-; CHECK-LE-NEXT:  .LBB24_12: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB24_5
-; CHECK-LE-NEXT:  .LBB24_13: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB24_6
-; CHECK-LE-NEXT:  .LBB24_14: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB24_7
-; CHECK-LE-NEXT:  .LBB24_15: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB24_8
-; CHECK-LE-NEXT:  .LBB24_16: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align4_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    bne .LBB24_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB24_11
-; CHECK-BE-NEXT:  .LBB24_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB24_12
-; CHECK-BE-NEXT:  .LBB24_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB24_13
-; CHECK-BE-NEXT:  .LBB24_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB24_14
-; CHECK-BE-NEXT:  .LBB24_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB24_15
-; CHECK-BE-NEXT:  .LBB24_6: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB24_16
-; CHECK-BE-NEXT:  .LBB24_7: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB24_9
-; CHECK-BE-NEXT:  .LBB24_8: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q1[7], r0
-; CHECK-BE-NEXT:  .LBB24_9: @ %else20
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q1, [r0]
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB24_10: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB24_2
-; CHECK-BE-NEXT:  .LBB24_11: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB24_3
-; CHECK-BE-NEXT:  .LBB24_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB24_4
-; CHECK-BE-NEXT:  .LBB24_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB24_5
-; CHECK-BE-NEXT:  .LBB24_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB24_6
-; CHECK-BE-NEXT:  .LBB24_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB24_7
-; CHECK-BE-NEXT:  .LBB24_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB24_8
-; CHECK-BE-NEXT:    b .LBB24_9
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> undef)
@@ -3339,248 +651,20 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_align1_undef:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #40
-; CHECK-LE-NEXT:    sub sp, #40
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    bne .LBB25_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB25_10
-; CHECK-LE-NEXT:  .LBB25_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB25_11
-; CHECK-LE-NEXT:  .LBB25_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB25_12
-; CHECK-LE-NEXT:  .LBB25_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB25_13
-; CHECK-LE-NEXT:  .LBB25_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB25_14
-; CHECK-LE-NEXT:  .LBB25_6: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB25_15
-; CHECK-LE-NEXT:  .LBB25_7: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB25_16
-; CHECK-LE-NEXT:  .LBB25_8: @ %else20
-; CHECK-LE-NEXT:    add sp, #40
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB25_9: @ %cond.load
-; CHECK-LE-NEXT:    ldrh r2, [r0]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #28]
-; CHECK-LE-NEXT:    vldr.16 s0, [sp, #28]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB25_2
-; CHECK-LE-NEXT:  .LBB25_10: @ %cond.load1
-; CHECK-LE-NEXT:    ldrh r2, [r0, #2]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #24]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #24]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB25_3
-; CHECK-LE-NEXT:  .LBB25_11: @ %cond.load4
-; CHECK-LE-NEXT:    ldrh r2, [r0, #4]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #20]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #20]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB25_4
-; CHECK-LE-NEXT:  .LBB25_12: @ %cond.load7
-; CHECK-LE-NEXT:    ldrh r2, [r0, #6]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #16]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #16]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB25_5
-; CHECK-LE-NEXT:  .LBB25_13: @ %cond.load10
-; CHECK-LE-NEXT:    ldrh r2, [r0, #8]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #12]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB25_6
-; CHECK-LE-NEXT:  .LBB25_14: @ %cond.load13
-; CHECK-LE-NEXT:    ldrh r2, [r0, #10]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #8]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB25_7
-; CHECK-LE-NEXT:  .LBB25_15: @ %cond.load16
-; CHECK-LE-NEXT:    ldrh r2, [r0, #12]
-; CHECK-LE-NEXT:    strh.w r2, [sp, #4]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB25_8
-; CHECK-LE-NEXT:  .LBB25_16: @ %cond.load19
-; CHECK-LE-NEXT:    ldrh r0, [r0, #14]
-; CHECK-LE-NEXT:    strh.w r0, [sp]
-; CHECK-LE-NEXT:    vldr.16 s4, [sp]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:    add sp, #40
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrbt.u8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align1_undef:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #40
-; CHECK-BE-NEXT:    sub sp, #40
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    bne .LBB25_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB25_11
-; CHECK-BE-NEXT:  .LBB25_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB25_12
-; CHECK-BE-NEXT:  .LBB25_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB25_13
-; CHECK-BE-NEXT:  .LBB25_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB25_14
-; CHECK-BE-NEXT:  .LBB25_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB25_15
-; CHECK-BE-NEXT:  .LBB25_6: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB25_16
-; CHECK-BE-NEXT:  .LBB25_7: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB25_9
-; CHECK-BE-NEXT:  .LBB25_8: @ %cond.load19
-; CHECK-BE-NEXT:    ldrh r0, [r0, #14]
-; CHECK-BE-NEXT:    strh.w r0, [sp]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q1[7], r0
-; CHECK-BE-NEXT:  .LBB25_9: @ %else20
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-BE-NEXT:    vrev16.8 q1, q0
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
-; CHECK-BE-NEXT:    add sp, #40
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB25_10: @ %cond.load
-; CHECK-BE-NEXT:    ldrh r2, [r0]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #28]
-; CHECK-BE-NEXT:    vldr.16 s4, [sp, #28]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB25_2
-; CHECK-BE-NEXT:  .LBB25_11: @ %cond.load1
-; CHECK-BE-NEXT:    ldrh r2, [r0, #2]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #24]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #24]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB25_3
-; CHECK-BE-NEXT:  .LBB25_12: @ %cond.load4
-; CHECK-BE-NEXT:    ldrh r2, [r0, #4]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #20]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #20]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB25_4
-; CHECK-BE-NEXT:  .LBB25_13: @ %cond.load7
-; CHECK-BE-NEXT:    ldrh r2, [r0, #6]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #16]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #16]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB25_5
-; CHECK-BE-NEXT:  .LBB25_14: @ %cond.load10
-; CHECK-BE-NEXT:    ldrh r2, [r0, #8]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #12]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB25_6
-; CHECK-BE-NEXT:  .LBB25_15: @ %cond.load13
-; CHECK-BE-NEXT:    ldrh r2, [r0, #10]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #8]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB25_7
-; CHECK-BE-NEXT:  .LBB25_16: @ %cond.load16
-; CHECK-BE-NEXT:    ldrh r2, [r0, #12]
-; CHECK-BE-NEXT:    strh.w r2, [sp, #4]
-; CHECK-BE-NEXT:    vldr.16 s0, [sp, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q1[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB25_8
-; CHECK-BE-NEXT:    b .LBB25_9
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 1, <8 x i1> %c, <8 x half> undef)
@@ -3590,219 +674,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_other(<8 x half> *%dest, <8 x i16> %a, <8 x half> %b) {
 ; CHECK-LE-LABEL: masked_v8f16_align4_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    bne .LBB26_10
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB26_11
-; CHECK-LE-NEXT:  .LBB26_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB26_12
-; CHECK-LE-NEXT:  .LBB26_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB26_13
-; CHECK-LE-NEXT:  .LBB26_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB26_14
-; CHECK-LE-NEXT:  .LBB26_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB26_15
-; CHECK-LE-NEXT:  .LBB26_6: @ %else14
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB26_16
-; CHECK-LE-NEXT:  .LBB26_7: @ %else17
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB26_9
-; CHECK-LE-NEXT:  .LBB26_8: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s0
-; CHECK-LE-NEXT:    vmov.16 q1[7], r0
-; CHECK-LE-NEXT:  .LBB26_9: @ %else20
-; CHECK-LE-NEXT:    vmov q0, q1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB26_10: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[0], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB26_2
-; CHECK-LE-NEXT:  .LBB26_11: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[1], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB26_3
-; CHECK-LE-NEXT:  .LBB26_12: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[2], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB26_4
-; CHECK-LE-NEXT:  .LBB26_13: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[3], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB26_5
-; CHECK-LE-NEXT:  .LBB26_14: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[4], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB26_6
-; CHECK-LE-NEXT:  .LBB26_15: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[5], r2
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB26_7
-; CHECK-LE-NEXT:  .LBB26_16: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov.16 q1[6], r2
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB26_8
-; CHECK-LE-NEXT:    b .LBB26_9
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align4_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vrev64.16 q2, q0
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    bne .LBB26_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB26_11
-; CHECK-BE-NEXT:  .LBB26_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB26_12
-; CHECK-BE-NEXT:  .LBB26_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB26_13
-; CHECK-BE-NEXT:  .LBB26_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB26_14
-; CHECK-BE-NEXT:  .LBB26_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB26_15
-; CHECK-BE-NEXT:  .LBB26_6: @ %else14
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB26_16
-; CHECK-BE-NEXT:  .LBB26_7: @ %else17
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB26_9
-; CHECK-BE-NEXT:  .LBB26_8: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s0
-; CHECK-BE-NEXT:    vmov.16 q2[7], r0
-; CHECK-BE-NEXT:  .LBB26_9: @ %else20
-; CHECK-BE-NEXT:    vrev64.16 q0, q2
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q1, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    vpsel q1, q0, q2
+; CHECK-BE-NEXT:    vrev64.16 q0, q1
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB26_10: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[0], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB26_2
-; CHECK-BE-NEXT:  .LBB26_11: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[1], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB26_3
-; CHECK-BE-NEXT:  .LBB26_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[2], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB26_4
-; CHECK-BE-NEXT:  .LBB26_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[3], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB26_5
-; CHECK-BE-NEXT:  .LBB26_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[4], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB26_6
-; CHECK-BE-NEXT:  .LBB26_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[5], r2
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB26_7
-; CHECK-BE-NEXT:  .LBB26_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s0, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s0
-; CHECK-BE-NEXT:    vmov.16 q2[6], r2
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB26_8
-; CHECK-BE-NEXT:    b .LBB26_9
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
   %l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 2, <8 x i1> %c, <8 x half> %b)
@@ -3812,218 +699,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0, #4]
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r2, r12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-LE-NEXT:    uxtb r2, r3
-; CHECK-LE-NEXT:    lsls r3, r3, #31
-; CHECK-LE-NEXT:    bne .LBB27_10
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    bmi .LBB27_11
-; CHECK-LE-NEXT:  .LBB27_2: @ %else2
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    bmi .LBB27_12
-; CHECK-LE-NEXT:  .LBB27_3: @ %else5
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    bmi .LBB27_13
-; CHECK-LE-NEXT:  .LBB27_4: @ %else8
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    bmi .LBB27_14
-; CHECK-LE-NEXT:  .LBB27_5: @ %else11
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    bmi .LBB27_15
-; CHECK-LE-NEXT:  .LBB27_6: @ %else14
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    bmi .LBB27_16
-; CHECK-LE-NEXT:  .LBB27_7: @ %else17
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    bpl .LBB27_9
-; CHECK-LE-NEXT:  .LBB27_8: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r2
-; CHECK-LE-NEXT:  .LBB27_9: @ %else20
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    add sp, #8
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB27_10: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r3, r2, #30
-; CHECK-LE-NEXT:    bpl .LBB27_2
-; CHECK-LE-NEXT:  .LBB27_11: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    bpl .LBB27_3
-; CHECK-LE-NEXT:  .LBB27_12: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #28
-; CHECK-LE-NEXT:    bpl .LBB27_4
-; CHECK-LE-NEXT:  .LBB27_13: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #27
-; CHECK-LE-NEXT:    bpl .LBB27_5
-; CHECK-LE-NEXT:  .LBB27_14: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #26
-; CHECK-LE-NEXT:    bpl .LBB27_6
-; CHECK-LE-NEXT:  .LBB27_15: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r3
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    bpl .LBB27_7
-; CHECK-LE-NEXT:  .LBB27_16: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r3
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    bmi .LBB27_8
-; CHECK-LE-NEXT:    b .LBB27_9
 ;
 ; CHECK-BE-LABEL: masked_v8f16_preinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r3, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0, #4]
 ; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    and r2, r12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-BE-NEXT:    uxtb r2, r3
-; CHECK-BE-NEXT:    lsls r3, r3, #31
-; CHECK-BE-NEXT:    bne .LBB27_10
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    bmi .LBB27_11
-; CHECK-BE-NEXT:  .LBB27_2: @ %else2
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    bmi .LBB27_12
-; CHECK-BE-NEXT:  .LBB27_3: @ %else5
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    bmi .LBB27_13
-; CHECK-BE-NEXT:  .LBB27_4: @ %else8
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    bmi .LBB27_14
-; CHECK-BE-NEXT:  .LBB27_5: @ %else11
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    bmi .LBB27_15
-; CHECK-BE-NEXT:  .LBB27_6: @ %else14
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    bmi .LBB27_16
-; CHECK-BE-NEXT:  .LBB27_7: @ %else17
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    bpl .LBB27_9
-; CHECK-BE-NEXT:  .LBB27_8: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[7], r2
-; CHECK-BE-NEXT:  .LBB27_9: @ %else20
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB27_10: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r3, r2, #30
-; CHECK-BE-NEXT:    bpl .LBB27_2
-; CHECK-BE-NEXT:  .LBB27_11: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[1], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    bpl .LBB27_3
-; CHECK-BE-NEXT:  .LBB27_12: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[2], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #28
-; CHECK-BE-NEXT:    bpl .LBB27_4
-; CHECK-BE-NEXT:  .LBB27_13: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[3], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #27
-; CHECK-BE-NEXT:    bpl .LBB27_5
-; CHECK-BE-NEXT:  .LBB27_14: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[4], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #26
-; CHECK-BE-NEXT:    bpl .LBB27_6
-; CHECK-BE-NEXT:  .LBB27_15: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[5], r3
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    bpl .LBB27_7
-; CHECK-BE-NEXT:  .LBB27_16: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-BE-NEXT:    vmov r3, s4
-; CHECK-BE-NEXT:    vmov.16 q0[6], r3
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    bmi .LBB27_8
-; CHECK-BE-NEXT:    b .LBB27_9
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %z to <8 x half>*
@@ -4037,212 +728,22 @@ entry:
 define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_postinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-LE-NEXT:    uxtb r3, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    bne .LBB28_12
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    bmi .LBB28_13
-; CHECK-LE-NEXT:  .LBB28_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    bmi .LBB28_14
-; CHECK-LE-NEXT:  .LBB28_3: @ %else5
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    bmi .LBB28_15
-; CHECK-LE-NEXT:  .LBB28_4: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    bmi .LBB28_16
-; CHECK-LE-NEXT:  .LBB28_5: @ %else11
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    bpl .LBB28_7
-; CHECK-LE-NEXT:  .LBB28_6: @ %cond.load13
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[5], r2
-; CHECK-LE-NEXT:  .LBB28_7: @ %else14
-; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    lsls r2, r3, #25
-; CHECK-LE-NEXT:    bpl .LBB28_9
-; CHECK-LE-NEXT:  @ %bb.8: @ %cond.load16
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[6], r2
-; CHECK-LE-NEXT:  .LBB28_9: @ %else17
-; CHECK-LE-NEXT:    lsls r2, r3, #24
-; CHECK-LE-NEXT:    bpl .LBB28_11
-; CHECK-LE-NEXT:  @ %bb.10: @ %cond.load19
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-LE-NEXT:    vmov r0, s4
-; CHECK-LE-NEXT:    vmov.16 q0[7], r0
-; CHECK-LE-NEXT:  .LBB28_11: @ %else20
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LE-NEXT:    mov r0, r12
-; CHECK-LE-NEXT:    add sp, #8
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB28_12: @ %cond.load
-; CHECK-LE-NEXT:    vldr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r3, #30
-; CHECK-LE-NEXT:    bpl .LBB28_2
-; CHECK-LE-NEXT:  .LBB28_13: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[1], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #29
-; CHECK-LE-NEXT:    bpl .LBB28_3
-; CHECK-LE-NEXT:  .LBB28_14: @ %cond.load4
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[2], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #28
-; CHECK-LE-NEXT:    bpl .LBB28_4
-; CHECK-LE-NEXT:  .LBB28_15: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[3], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #27
-; CHECK-LE-NEXT:    bpl .LBB28_5
-; CHECK-LE-NEXT:  .LBB28_16: @ %cond.load10
-; CHECK-LE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-LE-NEXT:    vmov r2, s4
-; CHECK-LE-NEXT:    vmov.16 q0[4], r2
-; CHECK-LE-NEXT:    lsls r2, r3, #26
-; CHECK-LE-NEXT:    bmi .LBB28_6
-; CHECK-LE-NEXT:    b .LBB28_7
 ;
 ; CHECK-BE-LABEL: masked_v8f16_postinc:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-BE-NEXT:    uxtb r3, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    bne .LBB28_12
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    bmi .LBB28_13
-; CHECK-BE-NEXT:  .LBB28_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    bmi .LBB28_14
-; CHECK-BE-NEXT:  .LBB28_3: @ %else5
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    bmi .LBB28_15
-; CHECK-BE-NEXT:  .LBB28_4: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    bmi .LBB28_16
-; CHECK-BE-NEXT:  .LBB28_5: @ %else11
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    bpl .LBB28_7
-; CHECK-BE-NEXT:  .LBB28_6: @ %cond.load13
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[5], r2
-; CHECK-BE-NEXT:  .LBB28_7: @ %else14
-; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    lsls r2, r3, #25
-; CHECK-BE-NEXT:    bpl .LBB28_9
-; CHECK-BE-NEXT:  @ %bb.8: @ %cond.load16
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #12]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[6], r2
-; CHECK-BE-NEXT:  .LBB28_9: @ %else17
-; CHECK-BE-NEXT:    lsls r2, r3, #24
-; CHECK-BE-NEXT:    bpl .LBB28_11
-; CHECK-BE-NEXT:  @ %bb.10: @ %cond.load19
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #14]
-; CHECK-BE-NEXT:    vmov r0, s4
-; CHECK-BE-NEXT:    vmov.16 q0[7], r0
-; CHECK-BE-NEXT:  .LBB28_11: @ %else20
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
-; CHECK-BE-NEXT:    mov r0, r12
-; CHECK-BE-NEXT:    add sp, #8
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB28_12: @ %cond.load
-; CHECK-BE-NEXT:    vldr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r2, r3, #30
-; CHECK-BE-NEXT:    bpl .LBB28_2
-; CHECK-BE-NEXT:  .LBB28_13: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[1], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #29
-; CHECK-BE-NEXT:    bpl .LBB28_3
-; CHECK-BE-NEXT:  .LBB28_14: @ %cond.load4
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #4]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[2], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #28
-; CHECK-BE-NEXT:    bpl .LBB28_4
-; CHECK-BE-NEXT:  .LBB28_15: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[3], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #27
-; CHECK-BE-NEXT:    bpl .LBB28_5
-; CHECK-BE-NEXT:  .LBB28_16: @ %cond.load10
-; CHECK-BE-NEXT:    vldr.16 s4, [r0, #8]
-; CHECK-BE-NEXT:    vmov r2, s4
-; CHECK-BE-NEXT:    vmov.16 q0[4], r2
-; CHECK-BE-NEXT:    lsls r2, r3, #26
-; CHECK-BE-NEXT:    bmi .LBB28_6
-; CHECK-BE-NEXT:    b .LBB28_7
 entry:
   %z = getelementptr inbounds i8, i8* %x, i32 4
   %0 = bitcast i8* %x to <8 x half>*

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll?rev=371932&r1=371931&r2=371932&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll Sun Sep 15 07:14:47 2019
@@ -1,85 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
-; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
 define arm_aapcs_vfpcc void @masked_v4i32(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, s0
-; CHECK-LE-NEXT:    strne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s1
-; CHECK-LE-NEXT:    strmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s2
-; CHECK-LE-NEXT:    strmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s3
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r2, s4
-; CHECK-BE-NEXT:    strne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s5
-; CHECK-BE-NEXT:    strmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s6
-; CHECK-BE-NEXT:    strmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s7
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -90,81 +26,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, s0
-; CHECK-LE-NEXT:    strne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s1
-; CHECK-LE-NEXT:    strmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s2
-; CHECK-LE-NEXT:    strmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s3
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r1, #0
+; CHECK-BE-NEXT:    vrev32.8 q0, q1
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r2, s4
-; CHECK-BE-NEXT:    strne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s5
-; CHECK-BE-NEXT:    strmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s6
-; CHECK-BE-NEXT:    strmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s7
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -175,89 +48,25 @@ entry:
 define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r1, r2, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, s0
-; CHECK-LE-NEXT:    strne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s1
-; CHECK-LE-NEXT:    strmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s2
-; CHECK-LE-NEXT:    strmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s3
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r1, r2, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r2, s0
-; CHECK-BE-NEXT:    strne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s1
-; CHECK-BE-NEXT:    strmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s2
-; CHECK-BE-NEXT:    strmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s3
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -272,91 +81,25 @@ entry:
 define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r2, r2, #15
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r1, s0
-; CHECK-LE-NEXT:    strne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s1
-; CHECK-LE-NEXT:    strmi r1, [r0, #4]
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r3, s2
-; CHECK-LE-NEXT:    strmi r3, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s3
-; CHECK-LE-NEXT:    strmi r2, [r0, #12]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4i32_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r2, r2, #15
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r1, s0
-; CHECK-BE-NEXT:    strne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s1
-; CHECK-BE-NEXT:    strmi r1, [r0, #4]
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r3, s2
-; CHECK-BE-NEXT:    strmi r3, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s3
-; CHECK-BE-NEXT:    strmi r2, [r0, #12]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -372,137 +115,17 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8i16(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-LE-NEXT:    strhne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r2, q1[0]
-; CHECK-BE-NEXT:    strhne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[1]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[2]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[3]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[4]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[5]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[6]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q1[7]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -513,137 +136,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-LE-NEXT:    strhne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
+; CHECK-BE-NEXT:    vrev16.8 q0, q1
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r2, q1[0]
-; CHECK-BE-NEXT:    strhne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[1]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[2]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[3]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[4]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[5]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q1[6]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q1[7]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -654,145 +158,25 @@ entry:
 define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-LE-NEXT:    strhne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r2, q0[0]
-; CHECK-BE-NEXT:    strhne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[1]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[2]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[3]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[4]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[5]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[6]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[7]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -807,147 +191,25 @@ entry:
 define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r3, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r2, r12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-LE-NEXT:    lsls r1, r3, #31
-; CHECK-LE-NEXT:    uxtb r2, r3
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-LE-NEXT:    strhne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    add.w r1, r0, #4
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r3, q0[6]
-; CHECK-LE-NEXT:    strhmi r3, [r0, #12]
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[7]
-; CHECK-LE-NEXT:    strhmi r2, [r0, #14]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8i16_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r3, #0
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r2, r12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-BE-NEXT:    lsls r1, r3, #31
-; CHECK-BE-NEXT:    uxtb r2, r3
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-BE-NEXT:    strhne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    add.w r1, r0, #4
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r3, q0[6]
-; CHECK-BE-NEXT:    strhmi r3, [r0, #12]
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[7]
-; CHECK-BE-NEXT:    strhmi r2, [r0, #14]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -963,168 +225,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v16i8(<16 x i8> *%dest, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    uxth r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u8 r2, q0[0]
-; CHECK-LE-NEXT:    strbne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[1]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[2]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[3]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[4]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[5]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[6]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[7]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[8]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[9]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[10]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[11]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[12]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[13]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[14]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    uxth r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u8 r2, q1[0]
-; CHECK-BE-NEXT:    strbne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[1]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[2]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[3]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[4]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[5]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[6]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[7]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[8]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[9]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[10]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[11]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[12]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[13]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q1[14]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q1[15]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q1, [r0]
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <16 x i8> %a, zeroinitializer
   call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %a, <16 x i8>* %dest, i32 1, <16 x i1> %c)
@@ -1134,176 +246,26 @@ entry:
 define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    vldr d1, [r7, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    uxth r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u8 r2, q0[0]
-; CHECK-LE-NEXT:    strbne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[1]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #1]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[2]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[3]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #3]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[4]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[5]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #5]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[6]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[7]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #7]
-; CHECK-LE-NEXT:    lsls r2, r1, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[8]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[9]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #9]
-; CHECK-LE-NEXT:    lsls r2, r1, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[10]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[11]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #11]
-; CHECK-LE-NEXT:    lsls r2, r1, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[12]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r2, r1, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[13]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #13]
-; CHECK-LE-NEXT:    lsls r2, r1, #17
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[14]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #14]
-; CHECK-LE-NEXT:    lsls r1, r1, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    vldr d1, [r7, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    uxth r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u8 r2, q0[0]
-; CHECK-BE-NEXT:    strbne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[1]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #1]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[2]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[3]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #3]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[4]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[5]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #5]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[6]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[7]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #7]
-; CHECK-BE-NEXT:    lsls r2, r1, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[8]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[9]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #9]
-; CHECK-BE-NEXT:    lsls r2, r1, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[10]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[11]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #11]
-; CHECK-BE-NEXT:    lsls r2, r1, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[12]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r2, r1, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[13]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #13]
-; CHECK-BE-NEXT:    lsls r2, r1, #17
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[14]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #14]
-; CHECK-BE-NEXT:    lsls r1, r1, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[15]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #15]
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    vrev64.8 q2, q0
+; CHECK-BE-NEXT:    vcmp.s8 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
@@ -1317,178 +279,26 @@ entry:
 define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-LE-NEXT:    .setfp r7, sp, #8
-; CHECK-LE-NEXT:    add r7, sp, #8
-; CHECK-LE-NEXT:    .pad #16
-; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    mov r4, sp
-; CHECK-LE-NEXT:    bfc r4, #0, #4
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    vldr d1, [r7, #8]
-; CHECK-LE-NEXT:    sub.w r4, r7, #8
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    uxth r2, r1
-; CHECK-LE-NEXT:    lsls r1, r1, #31
-; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-LE-NEXT:    strbne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-LE-NEXT:    lsls r1, r2, #25
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #24
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-LE-NEXT:    lsls r1, r2, #23
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #22
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-LE-NEXT:    lsls r1, r2, #21
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-LE-NEXT:    lsls r1, r2, #20
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-LE-NEXT:    lsls r1, r2, #19
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r2, #18
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-LE-NEXT:    lsls r3, r2, #17
-; CHECK-LE-NEXT:    add.w r1, r0, #4
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r3, q0[14]
-; CHECK-LE-NEXT:    strbmi r3, [r0, #14]
-; CHECK-LE-NEXT:    lsls r2, r2, #16
-; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[15]
-; CHECK-LE-NEXT:    strbmi r2, [r0, #15]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v16i8_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
-; CHECK-BE-NEXT:    .setfp r7, sp, #8
-; CHECK-BE-NEXT:    add r7, sp, #8
-; CHECK-BE-NEXT:    .pad #16
-; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    mov r4, sp
-; CHECK-BE-NEXT:    bfc r4, #0, #4
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    vldr d1, [r7, #8]
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    uxth r2, r1
-; CHECK-BE-NEXT:    lsls r1, r1, #31
-; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-BE-NEXT:    strbne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-BE-NEXT:    lsls r1, r2, #25
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #24
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-BE-NEXT:    lsls r1, r2, #23
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #22
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-BE-NEXT:    lsls r1, r2, #21
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-BE-NEXT:    lsls r1, r2, #20
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-BE-NEXT:    lsls r1, r2, #19
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r2, #18
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-BE-NEXT:    lsls r3, r2, #17
-; CHECK-BE-NEXT:    add.w r1, r0, #4
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r3, q0[14]
-; CHECK-BE-NEXT:    strbmi r3, [r0, #14]
-; CHECK-BE-NEXT:    lsls r2, r2, #16
-; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[15]
-; CHECK-BE-NEXT:    strbmi r2, [r0, #15]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    mov sp, r4
-; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT:    vrev64.8 q2, q0
+; CHECK-BE-NEXT:    vcmp.s8 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <16 x i8>*
@@ -1503,74 +313,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v4f32(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) {
 ; CHECK-LE-LABEL: masked_v4f32:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #4
-; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q1, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #4
-; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vstrne s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s7, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <4 x i32> %b, zeroinitializer
@@ -1581,90 +335,19 @@ entry:
 define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) {
 ; CHECK-LE-LABEL: masked_v4f32_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #20
-; CHECK-LE-NEXT:    sub sp, #20
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q1, zr
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vmrs r2, p0
-; CHECK-LE-NEXT:    and r3, r2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    and r1, r1, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    ittt ne
-; CHECK-LE-NEXT:    vstrne s0, [sp, #12]
-; CHECK-LE-NEXT:    ldrne r2, [sp, #12]
-; CHECK-LE-NEXT:    strne r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    ittt mi
-; CHECK-LE-NEXT:    vstrmi s1, [sp, #8]
-; CHECK-LE-NEXT:    ldrmi r2, [sp, #8]
-; CHECK-LE-NEXT:    strmi r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    ittt mi
-; CHECK-LE-NEXT:    vstrmi s2, [sp, #4]
-; CHECK-LE-NEXT:    ldrmi r2, [sp, #4]
-; CHECK-LE-NEXT:    strmi r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    ittt mi
-; CHECK-LE-NEXT:    vstrmi s3, [sp]
-; CHECK-LE-NEXT:    ldrmi r1, [sp]
-; CHECK-LE-NEXT:    strmi r1, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #20
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #20
-; CHECK-BE-NEXT:    sub sp, #20
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    and r1, r1, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    ittt ne
-; CHECK-BE-NEXT:    vstrne s4, [sp, #12]
-; CHECK-BE-NEXT:    ldrne r2, [sp, #12]
-; CHECK-BE-NEXT:    strne r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    ittt mi
-; CHECK-BE-NEXT:    vstrmi s5, [sp, #8]
-; CHECK-BE-NEXT:    ldrmi r2, [sp, #8]
-; CHECK-BE-NEXT:    strmi r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    ittt mi
-; CHECK-BE-NEXT:    vstrmi s6, [sp, #4]
-; CHECK-BE-NEXT:    ldrmi r2, [sp, #4]
-; CHECK-BE-NEXT:    strmi r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    ittt mi
-; CHECK-BE-NEXT:    vstrmi s7, [sp]
-; CHECK-BE-NEXT:    ldrmi r1, [sp]
-; CHECK-BE-NEXT:    strmi r1, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #20
+; CHECK-BE-NEXT:    vrev32.8 q0, q1
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <4 x i32> %b, zeroinitializer
@@ -1675,81 +358,25 @@ entry:
 define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r1, r2, #15
-; CHECK-LE-NEXT:    lsls r2, r1, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r1, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r1, r2, #15
-; CHECK-BE-NEXT:    lsls r2, r1, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vstrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r1, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1764,83 +391,25 @@ entry:
 define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    and r2, r2, #15
-; CHECK-LE-NEXT:    lsls r1, r2, #31
-; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #29
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r2, #28
-; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    and r2, r2, #15
-; CHECK-BE-NEXT:    lsls r1, r2, #31
-; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    vstrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #29
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r2, #28
-; CHECK-BE-NEXT:    it mi
-; CHECK-BE-NEXT:    vstrmi s3, [r0, #12]
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.32 q2, q0
+; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1856,194 +425,18 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8f16(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) {
 ; CHECK-LE-LABEL: masked_v8f16:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    bne .LBB15_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB15_10
-; CHECK-LE-NEXT:  .LBB15_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB15_11
-; CHECK-LE-NEXT:  .LBB15_3: @ %else4
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB15_12
-; CHECK-LE-NEXT:  .LBB15_4: @ %else6
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB15_13
-; CHECK-LE-NEXT:  .LBB15_5: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB15_14
-; CHECK-LE-NEXT:  .LBB15_6: @ %else10
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB15_15
-; CHECK-LE-NEXT:  .LBB15_7: @ %else12
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB15_16
-; CHECK-LE-NEXT:  .LBB15_8: @ %else14
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB15_9: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB15_2
-; CHECK-LE-NEXT:  .LBB15_10: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB15_3
-; CHECK-LE-NEXT:  .LBB15_11: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB15_4
-; CHECK-LE-NEXT:  .LBB15_12: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB15_5
-; CHECK-LE-NEXT:  .LBB15_13: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB15_6
-; CHECK-LE-NEXT:  .LBB15_14: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB15_7
-; CHECK-LE-NEXT:  .LBB15_15: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB15_8
-; CHECK-LE-NEXT:  .LBB15_16: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    bne .LBB15_9
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB15_10
-; CHECK-BE-NEXT:  .LBB15_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB15_11
-; CHECK-BE-NEXT:  .LBB15_3: @ %else4
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB15_12
-; CHECK-BE-NEXT:  .LBB15_4: @ %else6
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB15_13
-; CHECK-BE-NEXT:  .LBB15_5: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB15_14
-; CHECK-BE-NEXT:  .LBB15_6: @ %else10
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB15_15
-; CHECK-BE-NEXT:  .LBB15_7: @ %else12
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB15_16
-; CHECK-BE-NEXT:  .LBB15_8: @ %else14
-; CHECK-BE-NEXT:    add sp, #8
-; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB15_9: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s4, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB15_2
-; CHECK-BE-NEXT:  .LBB15_10: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s0, s4
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB15_3
-; CHECK-BE-NEXT:  .LBB15_11: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s5, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB15_4
-; CHECK-BE-NEXT:  .LBB15_12: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s0, s5
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB15_5
-; CHECK-BE-NEXT:  .LBB15_13: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s6, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB15_6
-; CHECK-BE-NEXT:  .LBB15_14: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s0, s6
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB15_7
-; CHECK-BE-NEXT:  .LBB15_15: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s7, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB15_8
-; CHECK-BE-NEXT:  .LBB15_16: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s7
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <8 x i16> %b, zeroinitializer
@@ -2054,226 +447,19 @@ entry:
 define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) {
 ; CHECK-LE-LABEL: masked_v8f16_align1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #40
-; CHECK-LE-NEXT:    sub sp, #40
 ; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmrs r1, p0
-; CHECK-LE-NEXT:    and r3, r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    bne .LBB16_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB16_10
-; CHECK-LE-NEXT:  .LBB16_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB16_11
-; CHECK-LE-NEXT:  .LBB16_3: @ %else4
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB16_12
-; CHECK-LE-NEXT:  .LBB16_4: @ %else6
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB16_13
-; CHECK-LE-NEXT:  .LBB16_5: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB16_14
-; CHECK-LE-NEXT:  .LBB16_6: @ %else10
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB16_15
-; CHECK-LE-NEXT:  .LBB16_7: @ %else12
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB16_16
-; CHECK-LE-NEXT:  .LBB16_8: @ %else14
-; CHECK-LE-NEXT:    add sp, #40
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB16_9: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [sp, #28]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #28]
-; CHECK-LE-NEXT:    strh r2, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB16_2
-; CHECK-LE-NEXT:  .LBB16_10: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #24]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #24]
-; CHECK-LE-NEXT:    strh r2, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB16_3
-; CHECK-LE-NEXT:  .LBB16_11: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [sp, #20]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #20]
-; CHECK-LE-NEXT:    strh r2, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB16_4
-; CHECK-LE-NEXT:  .LBB16_12: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #16]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #16]
-; CHECK-LE-NEXT:    strh r2, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB16_5
-; CHECK-LE-NEXT:  .LBB16_13: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [sp, #12]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #12]
-; CHECK-LE-NEXT:    strh r2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB16_6
-; CHECK-LE-NEXT:  .LBB16_14: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #8]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #8]
-; CHECK-LE-NEXT:    strh r2, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB16_7
-; CHECK-LE-NEXT:  .LBB16_15: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [sp, #4]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp, #4]
-; CHECK-LE-NEXT:    strh r2, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB16_8
-; CHECK-LE-NEXT:  .LBB16_16: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [sp]
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    strh r1, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #40
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_align1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #40
-; CHECK-BE-NEXT:    sub sp, #40
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vrev16.8 q0, q2
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r3, r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    rsbs r1, r1, #0
-; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    bne .LBB16_9
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB16_10
-; CHECK-BE-NEXT:  .LBB16_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB16_11
-; CHECK-BE-NEXT:  .LBB16_3: @ %else4
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB16_12
-; CHECK-BE-NEXT:  .LBB16_4: @ %else6
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB16_13
-; CHECK-BE-NEXT:  .LBB16_5: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB16_14
-; CHECK-BE-NEXT:  .LBB16_6: @ %else10
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB16_15
-; CHECK-BE-NEXT:  .LBB16_7: @ %else12
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB16_16
-; CHECK-BE-NEXT:  .LBB16_8: @ %else14
-; CHECK-BE-NEXT:    add sp, #40
-; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB16_9: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s4, [sp, #28]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #28]
-; CHECK-BE-NEXT:    strh r2, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB16_2
-; CHECK-BE-NEXT:  .LBB16_10: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s0, s4
-; CHECK-BE-NEXT:    vstr.16 s0, [sp, #24]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #24]
-; CHECK-BE-NEXT:    strh r2, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB16_3
-; CHECK-BE-NEXT:  .LBB16_11: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s5, [sp, #20]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #20]
-; CHECK-BE-NEXT:    strh r2, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB16_4
-; CHECK-BE-NEXT:  .LBB16_12: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s0, s5
-; CHECK-BE-NEXT:    vstr.16 s0, [sp, #16]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #16]
-; CHECK-BE-NEXT:    strh r2, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB16_5
-; CHECK-BE-NEXT:  .LBB16_13: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s6, [sp, #12]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #12]
-; CHECK-BE-NEXT:    strh r2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB16_6
-; CHECK-BE-NEXT:  .LBB16_14: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s0, s6
-; CHECK-BE-NEXT:    vstr.16 s0, [sp, #8]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #8]
-; CHECK-BE-NEXT:    strh r2, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB16_7
-; CHECK-BE-NEXT:  .LBB16_15: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s7, [sp, #4]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp, #4]
-; CHECK-BE-NEXT:    strh r2, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB16_8
-; CHECK-BE-NEXT:  .LBB16_16: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s7
-; CHECK-BE-NEXT:    vstr.16 s0, [sp]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    strh r1, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #40
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp ugt <8 x i16> %b, zeroinitializer
@@ -2284,201 +470,25 @@ entry:
 define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r3, r12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-LE-NEXT:    uxtb r1, r2
-; CHECK-LE-NEXT:    lsls r2, r2, #31
-; CHECK-LE-NEXT:    bne .LBB17_9
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bmi .LBB17_10
-; CHECK-LE-NEXT:  .LBB17_2: @ %else2
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bmi .LBB17_11
-; CHECK-LE-NEXT:  .LBB17_3: @ %else4
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bmi .LBB17_12
-; CHECK-LE-NEXT:  .LBB17_4: @ %else6
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bmi .LBB17_13
-; CHECK-LE-NEXT:  .LBB17_5: @ %else8
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bmi .LBB17_14
-; CHECK-LE-NEXT:  .LBB17_6: @ %else10
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bmi .LBB17_15
-; CHECK-LE-NEXT:  .LBB17_7: @ %else12
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bmi .LBB17_16
-; CHECK-LE-NEXT:  .LBB17_8: @ %else14
-; CHECK-LE-NEXT:    add sp, #8
-; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB17_9: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r2, r1, #30
-; CHECK-LE-NEXT:    bpl .LBB17_2
-; CHECK-LE-NEXT:  .LBB17_10: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    lsls r2, r1, #29
-; CHECK-LE-NEXT:    bpl .LBB17_3
-; CHECK-LE-NEXT:  .LBB17_11: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r2, r1, #28
-; CHECK-LE-NEXT:    bpl .LBB17_4
-; CHECK-LE-NEXT:  .LBB17_12: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    lsls r2, r1, #27
-; CHECK-LE-NEXT:    bpl .LBB17_5
-; CHECK-LE-NEXT:  .LBB17_13: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r2, r1, #26
-; CHECK-LE-NEXT:    bpl .LBB17_6
-; CHECK-LE-NEXT:  .LBB17_14: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:    lsls r2, r1, #25
-; CHECK-LE-NEXT:    bpl .LBB17_7
-; CHECK-LE-NEXT:  .LBB17_15: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r1, #24
-; CHECK-LE-NEXT:    bpl .LBB17_8
-; CHECK-LE-NEXT:  .LBB17_16: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r3, r12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
-; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
-; CHECK-BE-NEXT:    uxtb r1, r2
-; CHECK-BE-NEXT:    lsls r2, r2, #31
-; CHECK-BE-NEXT:    bne .LBB17_9
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bmi .LBB17_10
-; CHECK-BE-NEXT:  .LBB17_2: @ %else2
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bmi .LBB17_11
-; CHECK-BE-NEXT:  .LBB17_3: @ %else4
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bmi .LBB17_12
-; CHECK-BE-NEXT:  .LBB17_4: @ %else6
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bmi .LBB17_13
-; CHECK-BE-NEXT:  .LBB17_5: @ %else8
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bmi .LBB17_14
-; CHECK-BE-NEXT:  .LBB17_6: @ %else10
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bmi .LBB17_15
-; CHECK-BE-NEXT:  .LBB17_7: @ %else12
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bmi .LBB17_16
-; CHECK-BE-NEXT:  .LBB17_8: @ %else14
-; CHECK-BE-NEXT:    add sp, #8
-; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB17_9: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r2, r1, #30
-; CHECK-BE-NEXT:    bpl .LBB17_2
-; CHECK-BE-NEXT:  .LBB17_10: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    lsls r2, r1, #29
-; CHECK-BE-NEXT:    bpl .LBB17_3
-; CHECK-BE-NEXT:  .LBB17_11: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r2, r1, #28
-; CHECK-BE-NEXT:    bpl .LBB17_4
-; CHECK-BE-NEXT:  .LBB17_12: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    lsls r2, r1, #27
-; CHECK-BE-NEXT:    bpl .LBB17_5
-; CHECK-BE-NEXT:  .LBB17_13: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r2, r1, #26
-; CHECK-BE-NEXT:    bpl .LBB17_6
-; CHECK-BE-NEXT:  .LBB17_14: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s4, s2
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:    lsls r2, r1, #25
-; CHECK-BE-NEXT:    bpl .LBB17_7
-; CHECK-BE-NEXT:  .LBB17_15: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r1, #24
-; CHECK-BE-NEXT:    bpl .LBB17_8
-; CHECK-BE-NEXT:  .LBB17_16: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s3
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2493,194 +503,26 @@ entry:
 define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .pad #8
-; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    vldr d1, [sp, #8]
+; CHECK-LE-NEXT:    vldr d1, [sp]
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    movs r3, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vmrs r12, p0
-; CHECK-LE-NEXT:    and r2, r12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-LE-NEXT:    uxtb r2, r3
-; CHECK-LE-NEXT:    lsls r1, r3, #31
-; CHECK-LE-NEXT:    bne .LBB18_12
-; CHECK-LE-NEXT:  @ %bb.1: @ %else
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    bmi .LBB18_13
-; CHECK-LE-NEXT:  .LBB18_2: @ %else2
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    bmi .LBB18_14
-; CHECK-LE-NEXT:  .LBB18_3: @ %else4
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    bmi .LBB18_15
-; CHECK-LE-NEXT:  .LBB18_4: @ %else6
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    bmi .LBB18_16
-; CHECK-LE-NEXT:  .LBB18_5: @ %else8
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    bpl .LBB18_7
-; CHECK-LE-NEXT:  .LBB18_6: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-LE-NEXT:  .LBB18_7: @ %else10
-; CHECK-LE-NEXT:    adds r1, r0, #4
-; CHECK-LE-NEXT:    lsls r3, r2, #25
-; CHECK-LE-NEXT:    bpl .LBB18_9
-; CHECK-LE-NEXT:  @ %bb.8: @ %cond.store11
-; CHECK-LE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-LE-NEXT:  .LBB18_9: @ %else12
-; CHECK-LE-NEXT:    lsls r2, r2, #24
-; CHECK-LE-NEXT:    bpl .LBB18_11
-; CHECK-LE-NEXT:  @ %bb.10: @ %cond.store13
-; CHECK-LE-NEXT:    vmovx.f16 s0, s3
-; CHECK-LE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-LE-NEXT:  .LBB18_11: @ %else14
-; CHECK-LE-NEXT:    mov r0, r1
-; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    vpst
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    bx lr
-; CHECK-LE-NEXT:  .LBB18_12: @ %cond.store
-; CHECK-LE-NEXT:    vstr.16 s0, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
-; CHECK-LE-NEXT:    bpl .LBB18_2
-; CHECK-LE-NEXT:  .LBB18_13: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
-; CHECK-LE-NEXT:    bpl .LBB18_3
-; CHECK-LE-NEXT:  .LBB18_14: @ %cond.store3
-; CHECK-LE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
-; CHECK-LE-NEXT:    bpl .LBB18_4
-; CHECK-LE-NEXT:  .LBB18_15: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
-; CHECK-LE-NEXT:    bpl .LBB18_5
-; CHECK-LE-NEXT:  .LBB18_16: @ %cond.store7
-; CHECK-LE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
-; CHECK-LE-NEXT:    bmi .LBB18_6
-; CHECK-LE-NEXT:    b .LBB18_7
 ;
 ; CHECK-BE-LABEL: masked_v8f16_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .pad #8
-; CHECK-BE-NEXT:    sub sp, #8
-; CHECK-BE-NEXT:    vldr d1, [sp, #8]
+; CHECK-BE-NEXT:    vldr d1, [sp]
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    movs r3, #0
-; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vmrs r12, p0
-; CHECK-BE-NEXT:    and r2, r12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
-; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
-; CHECK-BE-NEXT:    uxtb r2, r3
-; CHECK-BE-NEXT:    lsls r1, r3, #31
-; CHECK-BE-NEXT:    bne .LBB18_12
-; CHECK-BE-NEXT:  @ %bb.1: @ %else
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    bmi .LBB18_13
-; CHECK-BE-NEXT:  .LBB18_2: @ %else2
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    bmi .LBB18_14
-; CHECK-BE-NEXT:  .LBB18_3: @ %else4
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    bmi .LBB18_15
-; CHECK-BE-NEXT:  .LBB18_4: @ %else6
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    bmi .LBB18_16
-; CHECK-BE-NEXT:  .LBB18_5: @ %else8
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    bpl .LBB18_7
-; CHECK-BE-NEXT:  .LBB18_6: @ %cond.store9
-; CHECK-BE-NEXT:    vmovx.f16 s4, s2
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #10]
-; CHECK-BE-NEXT:  .LBB18_7: @ %else10
-; CHECK-BE-NEXT:    adds r1, r0, #4
-; CHECK-BE-NEXT:    lsls r3, r2, #25
-; CHECK-BE-NEXT:    bpl .LBB18_9
-; CHECK-BE-NEXT:  @ %bb.8: @ %cond.store11
-; CHECK-BE-NEXT:    vstr.16 s3, [r0, #12]
-; CHECK-BE-NEXT:  .LBB18_9: @ %else12
-; CHECK-BE-NEXT:    lsls r2, r2, #24
-; CHECK-BE-NEXT:    bpl .LBB18_11
-; CHECK-BE-NEXT:  @ %bb.10: @ %cond.store13
-; CHECK-BE-NEXT:    vmovx.f16 s0, s3
-; CHECK-BE-NEXT:    vstr.16 s0, [r0, #14]
-; CHECK-BE-NEXT:  .LBB18_11: @ %else14
-; CHECK-BE-NEXT:    mov r0, r1
-; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q2, q0
+; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
+; CHECK-BE-NEXT:    vpst
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
+; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    bx lr
-; CHECK-BE-NEXT:  .LBB18_12: @ %cond.store
-; CHECK-BE-NEXT:    vstr.16 s0, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
-; CHECK-BE-NEXT:    bpl .LBB18_2
-; CHECK-BE-NEXT:  .LBB18_13: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
-; CHECK-BE-NEXT:    bpl .LBB18_3
-; CHECK-BE-NEXT:  .LBB18_14: @ %cond.store3
-; CHECK-BE-NEXT:    vstr.16 s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
-; CHECK-BE-NEXT:    bpl .LBB18_4
-; CHECK-BE-NEXT:  .LBB18_15: @ %cond.store5
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
-; CHECK-BE-NEXT:    bpl .LBB18_5
-; CHECK-BE-NEXT:  .LBB18_16: @ %cond.store7
-; CHECK-BE-NEXT:    vstr.16 s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
-; CHECK-BE-NEXT:    bmi .LBB18_6
-; CHECK-BE-NEXT:    b .LBB18_7
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
   %0 = bitcast i8* %x to <8 x half>*

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll?rev=371932&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll Sun Sep 15 07:14:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt -loop-vectorize -enable-arm-maskedldst < %s -S -o - | FileCheck %s
+; Runs the loop vectorizer with the ARM masked load/store option and expects a <4 x i32> masked store in the output.
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1-m.main-none-eabi"
+; @test: for i = 0 .. n-1, stores 0 to A[i] only when A[i] lies in [-9, 9] (a conditional store inside the loop).
+; CHECK-LABEL: test
+; CHECK: llvm.masked.store.v4i32.p0v4i32
+define void @test(i32* nocapture %A, i32 %n) #0 {
+entry:
+  %cmp12 = icmp sgt i32 %n, 0                     ; the loop is skipped entirely when n <= 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.013
+  %0 = load i32, i32* %arrayidx, align 4
+  %.off = add i32 %0, 9                           ; bias by 9 so a single unsigned compare tests the range
+  %1 = icmp ult i32 %.off, 19                     ; true iff -9 <= A[i] <= 9
+  br i1 %1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i32 0, i32* %arrayidx, align 4            ; the guarded store the masked-store expectation above refers to
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i.013, 1
+  %exitcond = icmp eq i32 %inc, %n                ; leave the loop once i + 1 == n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { "target-features"="+mve" }      ; @test is compiled with the MVE target feature enabled




More information about the llvm-commits mailing list