[llvm] r371419 - [ARM] Fix loads and stores for predicate vectors

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 9 09:35:49 PDT 2019


Author: dmgreen
Date: Mon Sep  9 09:35:49 2019
New Revision: 371419

URL: http://llvm.org/viewvc/llvm-project?rev=371419&view=rev
Log:
[ARM] Fix loads and stores for predicate vectors

These predicate vectors can usually be loaded and stored with a single
instruction, a VSTR_P0. However, this instruction will store the entire P0
predicate, 16 bits, zero-extended to 32 bits, with each lane of the
v4i1/v8i1/v16i1 represented by 4/2/1 bits.

As far as I understand, when llvm says "store this v4i1", it really does need
to store 4 bits (or 8, that being the size of a byte, with the bottom 4 as the
interesting bits). For example, a bitcast from a v8i1 to an i8 is defined as a
store followed by a load, which is how the code is expanded.

So this instead lowers the v4i1/v8i1 load/store through some shuffles to get
the bits into the correct positions. This, as you might imagine, is not as
efficient as a single instruction, but I believe it is needed for correctness.
Equally, a v16i1 load/store should not access 32 bits, only the 16 bits of data.
Stack loads/stores are still using the VSTR_P0 (as can be seen by the test not
changing). This is fine as they are self-consistent; it is only "externally
observable loads/stores" (from our point of view) that need to be corrected.

Differential revision: https://reviews.llvm.org/D67085

Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
    llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-pred-bitcast.ll
    llvm/trunk/test/CodeGen/Thumb2/mve-pred-loadstore.ll

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=371419&r1=371418&r2=371419&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Mon Sep  9 09:35:49 2019
@@ -378,6 +378,8 @@ void ARMTargetLowering::addMVEVectorType
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::SETCC, VT, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+    setOperationAction(ISD::LOAD, VT, Custom);
+    setOperationAction(ISD::STORE, VT, Custom);
   }
 }
 
@@ -8783,6 +8785,65 @@ void ARMTargetLowering::ExpandDIV_Window
   Results.push_back(Upper);
 }
 
+static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
+  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+  EVT MemVT = LD->getMemoryVT();
+  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
+         "Expected a predicate type!");
+  assert(MemVT == Op.getValueType());
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Expected a non-extending load");
+  assert(LD->isUnindexed() && "Expected a unindexed load");
+
+  // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit
+  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
+  // need to make sure that 8/4 bits are actually loaded into the correct
+  // place, which means loading the value and then shuffling the values into
+  // the bottom bits of the predicate.
+  // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
+  // for BE).
+
+  SDLoc dl(Op);
+  SDValue Load = DAG.getExtLoad(
+      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
+      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+      LD->getMemOperand());
+  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
+  if (MemVT != MVT::v16i1)
+    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
+                       DAG.getConstant(0, dl, MVT::i32));
+  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
+}
+
+static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
+  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+  EVT MemVT = ST->getMemoryVT();
+  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
+         "Expected a predicate type!");
+  assert(MemVT == ST->getValue().getValueType());
+  assert(!ST->isTruncatingStore() && "Expected a non-extending store");
+  assert(ST->isUnindexed() && "Expected a unindexed store");
+
+  // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
+  // unset and a scalar store.
+  SDLoc dl(Op);
+  SDValue Build = ST->getValue();
+  if (MemVT != MVT::v16i1) {
+    SmallVector<SDValue, 16> Ops;
+    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
+                                DAG.getConstant(I, dl, MVT::i32)));
+    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
+      Ops.push_back(DAG.getUNDEF(MVT::i32));
+    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
+  }
+  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
+  return DAG.getTruncStore(
+      ST->getChain(), dl, GRP, ST->getBasePtr(),
+      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+      ST->getMemOperand());
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
     // Acquire/Release load/store is not legal for targets without a dmb or
@@ -8982,6 +9043,10 @@ SDValue ARMTargetLowering::LowerOperatio
   case ISD::UADDO:
   case ISD::USUBO:
     return LowerUnsignedALUO(Op, DAG);
+  case ISD::LOAD:
+    return LowerPredicateLoad(Op, DAG);
+  case ISD::STORE:
+    return LowerPredicateStore(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);

Modified: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrMVE.td?rev=371419&r1=371418&r2=371419&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td Mon Sep  9 09:35:49 2019
@@ -4999,24 +4999,6 @@ let Predicates = [HasMVEInt, IsBE] in {
   def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
 }
 
-let Predicates = [HasMVEInt] in {
-  // Predicate loads
-  def  : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
-             (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def  : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
-             (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def  : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
-             (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-
-  // Predicate stores
-  def  : Pat<(store (v4i1 VCCR:$val), t2addrmode_imm7<2>:$addr),
-             (VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>;
-  def  : Pat<(store (v8i1 VCCR:$val), t2addrmode_imm7<2>:$addr),
-             (VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>;
-  def  : Pat<(store (v16i1 VCCR:$val), t2addrmode_imm7<2>:$addr),
-             (VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>;
-}
-
 
 // Widening/Narrowing Loads/Stores
 

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll?rev=371419&r1=371418&r2=371419&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll Mon Sep  9 09:35:49 2019
@@ -8,11 +8,23 @@ define void @foo_v4i32_v4i32(<4 x i32> *
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrne r3, [r2]
@@ -29,9 +41,21 @@ define void @foo_v4i32_v4i32(<4 x i32> *
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrmi r1, [r2, #12]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -64,11 +88,23 @@ define void @foo_sext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
@@ -85,11 +121,23 @@ define void @foo_sext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #3]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -123,11 +171,23 @@ define void @foo_sext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrhne r3, [r2]
@@ -144,10 +204,22 @@ define void @foo_sext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #6]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -181,12 +253,24 @@ define void @foo_zext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
@@ -203,10 +287,22 @@ define void @foo_zext_v4i32_v4i8(<4 x i3
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #3]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -240,11 +336,23 @@ define void @foo_zext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrhne r3, [r2]
@@ -261,10 +369,22 @@ define void @foo_zext_v4i32_v4i16(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #6]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -298,12 +418,36 @@ define void @foo_v8i16_v8i16(<8 x i16> *
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #2, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #4, #1
+; CHECK-NEXT:    ubfx r1, r12, #10, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #5, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #6, #1
+; CHECK-NEXT:    ubfx r1, r12, #14, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r3
+; CHECK-NEXT:    lsls r3, r3, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrhne r3, [r2]
 ; CHECK-NEXT:    vmovne.16 q0[0], r3
@@ -335,10 +479,34 @@ define void @foo_v8i16_v8i16(<8 x i16> *
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #14]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmrs r1, p0
+; CHECK-NEXT:    and r3, r1, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #2, #1
+; CHECK-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #3, #1
+; CHECK-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #4, #1
+; CHECK-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #5, #1
+; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r2, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r2
+; CHECK-NEXT:    lsls r2, r2, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne.u16 r2, q0[0]
 ; CHECK-NEXT:    strhne r2, [r0]
@@ -386,12 +554,36 @@ define void @foo_sext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #2, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #4, #1
+; CHECK-NEXT:    ubfx r1, r12, #10, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #5, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #6, #1
+; CHECK-NEXT:    ubfx r1, r12, #14, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r3
+; CHECK-NEXT:    lsls r3, r3, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
 ; CHECK-NEXT:    vmovne.16 q0[0], r3
@@ -423,11 +615,35 @@ define void @foo_sext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #7]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmrs r1, p0
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
+; CHECK-NEXT:    and r3, r1, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #2, #1
+; CHECK-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #3, #1
+; CHECK-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #4, #1
+; CHECK-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #5, #1
+; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r2, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r2
+; CHECK-NEXT:    lsls r2, r2, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne.u16 r2, q0[0]
 ; CHECK-NEXT:    strhne r2, [r0]
@@ -476,12 +692,36 @@ define void @foo_zext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #2, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #4, #1
+; CHECK-NEXT:    ubfx r1, r12, #10, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #5, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #6, #1
+; CHECK-NEXT:    ubfx r1, r12, #14, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r3
+; CHECK-NEXT:    lsls r3, r3, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
 ; CHECK-NEXT:    vmovne.16 q0[0], r3
@@ -513,11 +753,35 @@ define void @foo_zext_v8i16_v8i8(<8 x i1
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #7]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmrs r1, p0
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
+; CHECK-NEXT:    and r3, r1, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #2, #1
+; CHECK-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #3, #1
+; CHECK-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #4, #1
+; CHECK-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #5, #1
+; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r2, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r2
+; CHECK-NEXT:    lsls r2, r2, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne.u16 r2, q0[0]
 ; CHECK-NEXT:    strhne r2, [r0]
@@ -573,13 +837,12 @@ define void @foo_v16i8_v16i8(<16 x i8> *
 ; CHECK-NEXT:    bfc r4, #0, #4
 ; CHECK-NEXT:    mov sp, r4
 ; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #16
 ; CHECK-NEXT:    sub.w r4, r7, #8
 ; CHECK-NEXT:    vcmp.s8 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrh.w r1, [sp, #16]
-; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    vmrs r3, p0
+; CHECK-NEXT:    uxth r1, r3
+; CHECK-NEXT:    lsls r3, r3, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrbne r3, [r2]
 ; CHECK-NEXT:    vmovne.8 q0[0], r3
@@ -643,10 +906,9 @@ define void @foo_v16i8_v16i8(<16 x i8> *
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrbmi r1, [r2, #15]
 ; CHECK-NEXT:    vmovmi.8 q0[15], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrh.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    uxth r1, r2
+; CHECK-NEXT:    lsls r2, r2, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne.u8 r2, q0[0]
 ; CHECK-NEXT:    strbne r2, [r0]
@@ -726,12 +988,36 @@ define void @foo_trunc_v8i8_v8i16(<8 x i
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #2, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #4, #1
+; CHECK-NEXT:    ubfx r1, r12, #10, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #5, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #6, #1
+; CHECK-NEXT:    ubfx r1, r12, #14, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r3
+; CHECK-NEXT:    lsls r3, r3, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrhne r3, [r2]
 ; CHECK-NEXT:    vmovne.16 q0[0], r3
@@ -763,10 +1049,34 @@ define void @foo_trunc_v8i8_v8i16(<8 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrhmi r1, [r2, #14]
 ; CHECK-NEXT:    vmovmi.16 q0[7], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmrs r1, p0
+; CHECK-NEXT:    and r3, r1, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #2, #1
+; CHECK-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #3, #1
+; CHECK-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #4, #1
+; CHECK-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #5, #1
+; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r2, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r2
+; CHECK-NEXT:    lsls r2, r2, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne.u16 r2, q0[0]
 ; CHECK-NEXT:    strbne r2, [r0]
@@ -815,11 +1125,23 @@ define void @foo_trunc_v4i8_v4i32(<4 x i
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrne r3, [r2]
@@ -836,9 +1158,21 @@ define void @foo_trunc_v4i8_v4i32(<4 x i
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrmi r1, [r2, #12]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -872,11 +1206,23 @@ define void @foo_trunc_v4i16_v4i32(<4 x
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrne r3, [r2]
@@ -893,9 +1239,21 @@ define void @foo_trunc_v4i16_v4i32(<4 x
 ; CHECK-NEXT:    itt mi
 ; CHECK-NEXT:    ldrmi r1, [r2, #12]
 ; CHECK-NEXT:    vmovmi.32 q0[3], r1
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    vmovne r2, s0
@@ -929,11 +1287,23 @@ define void @foo_v4f32_v4f32(<4 x float>
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #4]
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    and r1, r3, #15
 ; CHECK-NEXT:    lsls r3, r1, #31
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    vldrne s0, [r2]
@@ -946,9 +1316,21 @@ define void @foo_v4f32_v4f32(<4 x float>
 ; CHECK-NEXT:    lsls r1, r1, #28
 ; CHECK-NEXT:    it mi
 ; CHECK-NEXT:    vldrmi s3, [r2, #12]
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    bfi r1, r2, #3, #1
+; CHECK-NEXT:    and r1, r1, #15
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    vstrne s0, [r0]
@@ -977,12 +1359,36 @@ define void @foo_v8f16_v8f16(<8 x half>
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    add r3, sp, #8
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vstr p0, [r3]
-; CHECK-NEXT:    ldrb.w r1, [sp, #8]
-; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r1, r12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #0, #1
+; CHECK-NEXT:    ubfx r1, r12, #2, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #1, #1
+; CHECK-NEXT:    ubfx r1, r12, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #2, #1
+; CHECK-NEXT:    ubfx r1, r12, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #3, #1
+; CHECK-NEXT:    ubfx r1, r12, #8, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #4, #1
+; CHECK-NEXT:    ubfx r1, r12, #10, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #5, #1
+; CHECK-NEXT:    ubfx r1, r12, #12, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #6, #1
+; CHECK-NEXT:    ubfx r1, r12, #14, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r3, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r3
+; CHECK-NEXT:    lsls r3, r3, #31
 ; CHECK-NEXT:    bne .LBB13_18
 ; CHECK-NEXT:  @ %bb.1: @ %else
 ; CHECK-NEXT:    lsls r3, r1, #30
@@ -1010,10 +1416,34 @@ define void @foo_v8f16_v8f16(<8 x half>
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:  .LBB13_9: @ %else20
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstr p0, [r1]
-; CHECK-NEXT:    ldrb.w r1, [sp]
-; CHECK-NEXT:    lsls r2, r1, #31
+; CHECK-NEXT:    vmrs r1, p0
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    and r3, r1, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #0, #1
+; CHECK-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #2, #1
+; CHECK-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #3, #1
+; CHECK-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #4, #1
+; CHECK-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #5, #1
+; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r2, r3, #6, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    bfi r2, r1, #7, #1
+; CHECK-NEXT:    uxtb r1, r2
+; CHECK-NEXT:    lsls r2, r2, #31
 ; CHECK-NEXT:    bne .LBB13_25
 ; CHECK-NEXT:  @ %bb.10: @ %else23
 ; CHECK-NEXT:    lsls r2, r1, #30
@@ -1072,13 +1502,13 @@ define void @foo_v8f16_v8f16(<8 x half>
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov.16 q0[5], r3
 ; CHECK-NEXT:    lsls r3, r1, #25
-; CHECK-NEXT:    bpl .LBB13_7
+; CHECK-NEXT:    bpl.w .LBB13_7
 ; CHECK-NEXT:  .LBB13_24: @ %cond.load16
 ; CHECK-NEXT:    vldr.16 s4, [r2, #12]
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov.16 q0[6], r3
 ; CHECK-NEXT:    lsls r1, r1, #24
-; CHECK-NEXT:    bmi .LBB13_8
+; CHECK-NEXT:    bmi.w .LBB13_8
 ; CHECK-NEXT:    b .LBB13_9
 ; CHECK-NEXT:  .LBB13_25: @ %cond.store
 ; CHECK-NEXT:    vstr.16 s0, [r0]

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll?rev=371419&r1=371418&r2=371419&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll Mon Sep  9 09:35:49 2019
@@ -7,17 +7,29 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    mov.w r12, #0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #3, #1
+; CHECK-LE-NEXT:    and r1, r2, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    beq .LBB0_2
 ; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    ldr r3, [r0]
-; CHECK-LE-NEXT:    vdup.32 q0, r2
-; CHECK-LE-NEXT:    vmov.32 q0[0], r3
+; CHECK-LE-NEXT:    ldr r2, [r0]
+; CHECK-LE-NEXT:    vdup.32 q0, r12
+; CHECK-LE-NEXT:    vmov.32 q0[0], r2
 ; CHECK-LE-NEXT:    b .LBB0_3
 ; CHECK-LE-NEXT:  .LBB0_2:
 ; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
@@ -42,17 +54,29 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    mov.w r12, #0
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #3, #1
+; CHECK-BE-NEXT:    and r1, r2, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    beq .LBB0_2
 ; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    ldr r3, [r0]
-; CHECK-BE-NEXT:    vdup.32 q1, r2
-; CHECK-BE-NEXT:    vmov.32 q1[0], r3
+; CHECK-BE-NEXT:    ldr r2, [r0]
+; CHECK-BE-NEXT:    vdup.32 q1, r12
+; CHECK-BE-NEXT:    vmov.32 q1[0], r2
 ; CHECK-BE-NEXT:    b .LBB0_3
 ; CHECK-BE-NEXT:  .LBB0_2:
 ; CHECK-BE-NEXT:    vmov.i32 q1, #0x0
@@ -84,10 +108,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r2, [r0]
@@ -112,11 +148,23 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrne r2, [r0]
@@ -148,10 +196,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r2, [r0]
@@ -176,11 +236,23 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrne r2, [r0]
@@ -211,10 +283,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r2, [r0]
@@ -239,10 +323,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrne r2, [r0]
@@ -274,11 +370,23 @@ define arm_aapcs_vfpcc i8* @masked_v4i32
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r12, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r2, r2, #15
 ; CHECK-LE-NEXT:    lsls r3, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r3, [r0]
@@ -304,12 +412,24 @@ define arm_aapcs_vfpcc i8* @masked_v4i32
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r2, r2, #15
 ; CHECK-BE-NEXT:    lsls r3, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrne r3, [r0]
@@ -345,11 +465,23 @@ define arm_aapcs_vfpcc i8* @masked_v4i32
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r12, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r3, r2, #15
 ; CHECK-LE-NEXT:    lsls r2, r3, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r2, [r0]
@@ -376,12 +508,24 @@ define arm_aapcs_vfpcc i8* @masked_v4i32
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r3, r2, #15
 ; CHECK-BE-NEXT:    lsls r2, r3, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrne r2, [r0]
@@ -419,17 +563,41 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    mov.w r12, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r2, r3, #0
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r3, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r3
+; CHECK-LE-NEXT:    lsls r2, r3, #31
 ; CHECK-LE-NEXT:    beq .LBB6_2
 ; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    ldrh r3, [r0]
-; CHECK-LE-NEXT:    vdup.16 q0, r2
-; CHECK-LE-NEXT:    vmov.16 q0[0], r3
+; CHECK-LE-NEXT:    ldrh r2, [r0]
+; CHECK-LE-NEXT:    vdup.16 q0, r12
+; CHECK-LE-NEXT:    vmov.16 q0[0], r2
 ; CHECK-LE-NEXT:    b .LBB6_3
 ; CHECK-LE-NEXT:  .LBB6_2:
 ; CHECK-LE-NEXT:    vmov.i32 q0, #0x0
@@ -470,17 +638,41 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    mov.w r12, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r2, r3, #0
+; CHECK-BE-NEXT:    movs r3, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r3, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r3
+; CHECK-BE-NEXT:    lsls r2, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB6_2
 ; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    ldrh r3, [r0]
-; CHECK-BE-NEXT:    vdup.16 q1, r2
-; CHECK-BE-NEXT:    vmov.16 q1[0], r3
+; CHECK-BE-NEXT:    ldrh r2, [r0]
+; CHECK-BE-NEXT:    vdup.16 q1, r12
+; CHECK-BE-NEXT:    vmov.16 q1[0], r2
 ; CHECK-BE-NEXT:    b .LBB6_3
 ; CHECK-BE-NEXT:  .LBB6_2:
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
@@ -529,11 +721,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrhne r2, [r0]
 ; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
@@ -573,12 +789,36 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrhne r2, [r0]
 ; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
@@ -625,11 +865,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrhne r2, [r0]
 ; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
@@ -669,12 +933,36 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrhne r2, [r0]
 ; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
@@ -720,11 +1008,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrhne r2, [r0]
 ; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
@@ -764,11 +1076,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrhne r2, [r0]
 ; CHECK-BE-NEXT:    vmovne.16 q1[0], r2
@@ -817,12 +1153,36 @@ define i8* @masked_v8i16_preinc(i8* %x,
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
+; CHECK-LE-NEXT:    movs r3, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
-; CHECK-LE-NEXT:    lsls r3, r2, #31
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r2, r12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-LE-NEXT:    uxtb r2, r3
+; CHECK-LE-NEXT:    lsls r3, r3, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrhne r3, [r0]
 ; CHECK-LE-NEXT:    vmovne.16 q0[0], r3
@@ -865,13 +1225,37 @@ define i8* @masked_v8i16_preinc(i8* %x,
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r3, #0
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r2, r12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-BE-NEXT:    uxtb r2, r3
+; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrhne r3, [r0]
 ; CHECK-BE-NEXT:    vmovne.16 q0[0], r3
@@ -922,12 +1306,36 @@ define arm_aapcs_vfpcc i8* @masked_v8i16
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r12, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
-; CHECK-LE-NEXT:    lsls r2, r3, #31
+; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-LE-NEXT:    uxtb r3, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrhne r2, [r0]
 ; CHECK-LE-NEXT:    vmovne.16 q0[0], r2
@@ -969,13 +1377,37 @@ define arm_aapcs_vfpcc i8* @masked_v8i16
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
+; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-BE-NEXT:    uxtb r3, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrhne r2, [r0]
 ; CHECK-BE-NEXT:    vmovne.16 q0[0], r2
@@ -1034,11 +1466,10 @@ define arm_aapcs_vfpcc <16 x i8> @masked
 ; CHECK-LE-NEXT:    mov r4, sp
 ; CHECK-LE-NEXT:    bfc r4, #0, #4
 ; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    uxth r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    beq .LBB12_2
 ; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-LE-NEXT:    movs r2, #0
@@ -1125,11 +1556,10 @@ define arm_aapcs_vfpcc <16 x i8> @masked
 ; CHECK-BE-NEXT:    bfc r4, #0, #4
 ; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    uxth r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    beq .LBB12_2
 ; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-BE-NEXT:    movs r2, #0
@@ -1224,12 +1654,11 @@ define arm_aapcs_vfpcc <16 x i8> @masked
 ; CHECK-LE-NEXT:    bfc r4, #0, #4
 ; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
 ; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    uxth r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrbne r2, [r0]
 ; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
@@ -1308,13 +1737,12 @@ define arm_aapcs_vfpcc <16 x i8> @masked
 ; CHECK-BE-NEXT:    bfc r4, #0, #4
 ; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    uxth r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrbne r2, [r0]
 ; CHECK-BE-NEXT:    vmovne.8 q1[0], r2
@@ -1399,12 +1827,11 @@ define arm_aapcs_vfpcc <16 x i8> @masked
 ; CHECK-LE-NEXT:    mov r4, sp
 ; CHECK-LE-NEXT:    bfc r4, #0, #4
 ; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
 ; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    uxth r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrbne r2, [r0]
 ; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
@@ -1483,12 +1910,11 @@ define arm_aapcs_vfpcc <16 x i8> @masked
 ; CHECK-BE-NEXT:    bfc r4, #0, #4
 ; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    uxth r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrbne r2, [r0]
 ; CHECK-BE-NEXT:    vmovne.8 q1[0], r2
@@ -1574,13 +2000,12 @@ define arm_aapcs_vfpcc i8* @masked_v16i8
 ; CHECK-LE-NEXT:    bfc r4, #0, #4
 ; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrh.w r2, [sp]
+; CHECK-LE-NEXT:    vmrs r3, p0
 ; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    lsls r3, r2, #31
+; CHECK-LE-NEXT:    uxth r2, r3
+; CHECK-LE-NEXT:    lsls r3, r3, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrbne r3, [r0]
 ; CHECK-LE-NEXT:    vmovne.8 q0[0], r3
@@ -1660,14 +2085,13 @@ define arm_aapcs_vfpcc i8* @masked_v16i8
 ; CHECK-BE-NEXT:    bfc r4, #0, #4
 ; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
+; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    ldrh.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
+; CHECK-BE-NEXT:    vmrs r3, p0
+; CHECK-BE-NEXT:    uxth r2, r3
+; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrbne r3, [r0]
 ; CHECK-BE-NEXT:    vmovne.8 q0[0], r3
@@ -1757,13 +2181,12 @@ define arm_aapcs_vfpcc i8* @masked_v16i8
 ; CHECK-LE-NEXT:    bfc r4, #0, #4
 ; CHECK-LE-NEXT:    mov sp, r4
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
 ; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r3, [sp]
+; CHECK-LE-NEXT:    vmrs r2, p0
 ; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    lsls r2, r3, #31
+; CHECK-LE-NEXT:    uxth r3, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrbne r2, [r0]
 ; CHECK-LE-NEXT:    vmovne.8 q0[0], r2
@@ -1844,14 +2267,13 @@ define arm_aapcs_vfpcc i8* @masked_v16i8
 ; CHECK-BE-NEXT:    bfc r4, #0, #4
 ; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r2]
+; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
 ; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    ldrh.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    uxth r3, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrbne r2, [r0]
 ; CHECK-BE-NEXT:    vmovne.8 q0[0], r2
@@ -1935,10 +2357,22 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    beq .LBB17_2
 ; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
@@ -1972,10 +2406,22 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    beq .LBB17_2
 ; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
@@ -2016,10 +2462,22 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    vldrne s0, [r0]
@@ -2040,11 +2498,23 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vldrne s4, [r0]
@@ -2072,10 +2542,22 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r2, [r0]
@@ -2100,11 +2582,23 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    ldrne r2, [r0]
@@ -2135,10 +2629,22 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    vldrne s4, [r0]
@@ -2160,11 +2666,23 @@ define arm_aapcs_vfpcc <4 x float> @mask
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q2, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vldrne s8, [r0]
@@ -2192,11 +2710,23 @@ define arm_aapcs_vfpcc i8* @masked_v4f32
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r12, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r2, r2, #15
 ; CHECK-LE-NEXT:    lsls r3, r2, #31
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    vldrne s0, [r0]
@@ -2218,12 +2748,24 @@ define arm_aapcs_vfpcc i8* @masked_v4f32
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r2, r2, #15
 ; CHECK-BE-NEXT:    lsls r3, r2, #31
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vldrne s0, [r0]
@@ -2255,11 +2797,23 @@ define arm_aapcs_vfpcc i8* @masked_v4f32
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r12, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    add.w r12, r0, #4
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r3, r2, #15
 ; CHECK-LE-NEXT:    lsls r2, r3, #31
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    vldrne s0, [r0]
@@ -2282,12 +2836,24 @@ define arm_aapcs_vfpcc i8* @masked_v4f32
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    add.w r12, r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r3, r2, #15
 ; CHECK-BE-NEXT:    lsls r2, r3, #31
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vldrne s0, [r0]
@@ -2320,11 +2886,35 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    beq .LBB23_2
 ; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-LE-NEXT:    vldr.16 s0, .LCPI23_0
@@ -2411,11 +3001,35 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    beq .LBB23_2
 ; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-BE-NEXT:    vldr.16 s0, .LCPI23_0
@@ -2509,11 +3123,35 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r1, sp
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    bne .LBB24_9
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
@@ -2591,12 +3229,36 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    bne .LBB24_10
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
@@ -2680,11 +3342,35 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-LE-NEXT:    .pad #40
 ; CHECK-LE-NEXT:    sub sp, #40
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    add r1, sp, #32
-; CHECK-LE-NEXT:    vstr p0, [r1]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    bne .LBB25_9
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
@@ -2778,12 +3464,36 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-BE-NEXT:    .pad #40
 ; CHECK-BE-NEXT:    sub sp, #40
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    add r1, sp, #32
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    bne .LBB25_10
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
@@ -2882,11 +3592,35 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    bne .LBB26_10
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
@@ -2966,12 +3700,36 @@ define arm_aapcs_vfpcc <8 x half> @maske
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q2, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q2, zr
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    bne .LBB26_10
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
@@ -3057,12 +3815,36 @@ define arm_aapcs_vfpcc i8* @masked_v8f16
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    vmrs r12, p0
 ; CHECK-LE-NEXT:    adds r0, #4
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    lsls r3, r2, #31
+; CHECK-LE-NEXT:    and r2, r12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-LE-NEXT:    uxtb r2, r3
+; CHECK-LE-NEXT:    lsls r3, r3, #31
 ; CHECK-LE-NEXT:    bne .LBB27_10
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r3, r2, #30
@@ -3140,13 +3922,37 @@ define arm_aapcs_vfpcc i8* @masked_v8f16
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r3, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    adds r0, #4
-; CHECK-BE-NEXT:    vstr p0, [r2]
+; CHECK-BE-NEXT:    vmrs r12, p0
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
-; CHECK-BE-NEXT:    lsls r3, r2, #31
+; CHECK-BE-NEXT:    and r2, r12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-BE-NEXT:    uxtb r2, r3
+; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    bne .LBB27_10
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r3, r2, #30
@@ -3234,11 +4040,35 @@ define arm_aapcs_vfpcc i8* @masked_v8f16
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r12, p0
 ; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    ldrb.w r3, [sp]
-; CHECK-LE-NEXT:    lsls r2, r3, #31
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-LE-NEXT:    uxtb r3, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    bne .LBB28_12
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r2, r3, #30
@@ -3314,12 +4144,36 @@ define arm_aapcs_vfpcc i8* @masked_v8f16
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r3, [sp]
-; CHECK-BE-NEXT:    lsls r2, r3, #31
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-BE-NEXT:    uxtb r3, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    bne .LBB28_12
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r2, r3, #30

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll?rev=371419&r1=371418&r2=371419&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll Mon Sep  9 09:35:49 2019
@@ -7,10 +7,22 @@ define arm_aapcs_vfpcc void @masked_v4i3
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne r2, s0
@@ -35,10 +47,22 @@ define arm_aapcs_vfpcc void @masked_v4i3
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne r2, s4
@@ -68,10 +92,22 @@ define arm_aapcs_vfpcc void @masked_v4i3
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne r2, s0
@@ -96,10 +132,22 @@ define arm_aapcs_vfpcc void @masked_v4i3
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne r2, s4
@@ -132,24 +180,36 @@ define i8* @masked_v4i32_pre(i8* %y, i8*
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
+; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r1, r2, #15
+; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r1, s0
-; CHECK-LE-NEXT:    strne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
+; CHECK-LE-NEXT:    vmovne r2, s0
+; CHECK-LE-NEXT:    strne r2, [r0]
+; CHECK-LE-NEXT:    lsls r2, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s1
-; CHECK-LE-NEXT:    strmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
+; CHECK-LE-NEXT:    vmovmi r2, s1
+; CHECK-LE-NEXT:    strmi r2, [r0, #4]
+; CHECK-LE-NEXT:    lsls r2, r1, #29
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s2
-; CHECK-LE-NEXT:    strmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
+; CHECK-LE-NEXT:    vmovmi r2, s2
+; CHECK-LE-NEXT:    strmi r2, [r0, #8]
+; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    vmovmi r1, s3
 ; CHECK-LE-NEXT:    strmi r1, [r0, #12]
@@ -163,25 +223,37 @@ define i8* @masked_v4i32_pre(i8* %y, i8*
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r1, r2, #15
+; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r1, s0
-; CHECK-BE-NEXT:    strne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
+; CHECK-BE-NEXT:    vmovne r2, s0
+; CHECK-BE-NEXT:    strne r2, [r0]
+; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s1
-; CHECK-BE-NEXT:    strmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
+; CHECK-BE-NEXT:    vmovmi r2, s1
+; CHECK-BE-NEXT:    strmi r2, [r0, #4]
+; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r1, s2
-; CHECK-BE-NEXT:    strmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
+; CHECK-BE-NEXT:    vmovmi r2, s2
+; CHECK-BE-NEXT:    strmi r2, [r0, #8]
+; CHECK-BE-NEXT:    lsls r1, r1, #28
 ; CHECK-BE-NEXT:    itt mi
 ; CHECK-BE-NEXT:    vmovmi r1, s3
 ; CHECK-BE-NEXT:    strmi r1, [r0, #12]
@@ -204,11 +276,23 @@ define i8* @masked_v4i32_post(i8* %y, i8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
+; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r2, r2, #15
 ; CHECK-LE-NEXT:    lsls r1, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne r1, s0
@@ -236,12 +320,24 @@ define i8* @masked_v4i32_post(i8* %y, i8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r2, r2, #15
 ; CHECK-BE-NEXT:    lsls r1, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne r1, s0
@@ -278,11 +374,35 @@ define arm_aapcs_vfpcc void @masked_v8i1
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
 ; CHECK-LE-NEXT:    strhne r2, [r0]
@@ -322,11 +442,35 @@ define arm_aapcs_vfpcc void @masked_v8i1
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne.u16 r2, q1[0]
 ; CHECK-BE-NEXT:    strhne r2, [r0]
@@ -371,11 +515,35 @@ define arm_aapcs_vfpcc void @masked_v8i1
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
 ; CHECK-LE-NEXT:    strhne r2, [r0]
@@ -415,11 +583,35 @@ define arm_aapcs_vfpcc void @masked_v8i1
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne.u16 r2, q1[0]
 ; CHECK-BE-NEXT:    strhne r2, [r0]
@@ -467,40 +659,64 @@ define i8* @masked_v8i16_pre(i8* %y, i8*
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
+; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
+; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-LE-NEXT:    strhne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
+; CHECK-LE-NEXT:    vmovne.u16 r2, q0[0]
+; CHECK-LE-NEXT:    strhne r2, [r0]
+; CHECK-LE-NEXT:    lsls r2, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
+; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[1]
+; CHECK-LE-NEXT:    strhmi r2, [r0, #2]
+; CHECK-LE-NEXT:    lsls r2, r1, #29
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
+; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[2]
+; CHECK-LE-NEXT:    strhmi r2, [r0, #4]
+; CHECK-LE-NEXT:    lsls r2, r1, #28
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
+; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[3]
+; CHECK-LE-NEXT:    strhmi r2, [r0, #6]
+; CHECK-LE-NEXT:    lsls r2, r1, #27
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
+; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[4]
+; CHECK-LE-NEXT:    strhmi r2, [r0, #8]
+; CHECK-LE-NEXT:    lsls r2, r1, #26
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-LE-NEXT:    lsls r1, r2, #25
+; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[5]
+; CHECK-LE-NEXT:    strhmi r2, [r0, #10]
+; CHECK-LE-NEXT:    lsls r2, r1, #25
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[6]
-; CHECK-LE-NEXT:    strhmi r1, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r2, #24
+; CHECK-LE-NEXT:    vmovmi.u16 r2, q0[6]
+; CHECK-LE-NEXT:    strhmi r2, [r0, #12]
+; CHECK-LE-NEXT:    lsls r1, r1, #24
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[7]
 ; CHECK-LE-NEXT:    strhmi r1, [r0, #14]
@@ -514,41 +730,65 @@ define i8* @masked_v8i16_pre(i8* %y, i8*
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
+; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u16 r1, q0[0]
-; CHECK-BE-NEXT:    strhne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
+; CHECK-BE-NEXT:    vmovne.u16 r2, q0[0]
+; CHECK-BE-NEXT:    strhne r2, [r0]
+; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[1]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
+; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[1]
+; CHECK-BE-NEXT:    strhmi r2, [r0, #2]
+; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[2]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
+; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[2]
+; CHECK-BE-NEXT:    strhmi r2, [r0, #4]
+; CHECK-BE-NEXT:    lsls r2, r1, #28
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[3]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
+; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[3]
+; CHECK-BE-NEXT:    strhmi r2, [r0, #6]
+; CHECK-BE-NEXT:    lsls r2, r1, #27
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[4]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
+; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[4]
+; CHECK-BE-NEXT:    strhmi r2, [r0, #8]
+; CHECK-BE-NEXT:    lsls r2, r1, #26
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[5]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-BE-NEXT:    lsls r1, r2, #25
+; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[5]
+; CHECK-BE-NEXT:    strhmi r2, [r0, #10]
+; CHECK-BE-NEXT:    lsls r2, r1, #25
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[6]
-; CHECK-BE-NEXT:    strhmi r1, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r2, #24
+; CHECK-BE-NEXT:    vmovmi.u16 r2, q0[6]
+; CHECK-BE-NEXT:    strhmi r2, [r0, #12]
+; CHECK-BE-NEXT:    lsls r1, r1, #24
 ; CHECK-BE-NEXT:    itt mi
 ; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[7]
 ; CHECK-BE-NEXT:    strhmi r1, [r0, #14]
@@ -571,12 +811,36 @@ define i8* @masked_v8i16_post(i8* %y, i8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
+; CHECK-LE-NEXT:    movs r3, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r2, r12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
+; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-LE-NEXT:    lsls r1, r3, #31
+; CHECK-LE-NEXT:    uxtb r2, r3
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne.u16 r1, q0[0]
 ; CHECK-LE-NEXT:    strhne r1, [r0]
@@ -600,8 +864,8 @@ define i8* @masked_v8i16_post(i8* %y, i8
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    vmovmi.u16 r1, q0[5]
 ; CHECK-LE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-LE-NEXT:    adds r1, r0, #4
 ; CHECK-LE-NEXT:    lsls r3, r2, #25
+; CHECK-LE-NEXT:    add.w r1, r0, #4
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    vmovmi.u16 r3, q0[6]
 ; CHECK-LE-NEXT:    strhmi r3, [r0, #12]
@@ -619,13 +883,37 @@ define i8* @masked_v8i16_post(i8* %y, i8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r3, #0
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r2, r12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
+; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-BE-NEXT:    lsls r1, r3, #31
+; CHECK-BE-NEXT:    uxtb r2, r3
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne.u16 r1, q0[0]
 ; CHECK-BE-NEXT:    strhne r1, [r0]
@@ -649,8 +937,8 @@ define i8* @masked_v8i16_post(i8* %y, i8
 ; CHECK-BE-NEXT:    itt mi
 ; CHECK-BE-NEXT:    vmovmi.u16 r1, q0[5]
 ; CHECK-BE-NEXT:    strhmi r1, [r0, #10]
-; CHECK-BE-NEXT:    adds r1, r0, #4
 ; CHECK-BE-NEXT:    lsls r3, r2, #25
+; CHECK-BE-NEXT:    add.w r1, r0, #4
 ; CHECK-BE-NEXT:    itt mi
 ; CHECK-BE-NEXT:    vmovmi.u16 r3, q0[6]
 ; CHECK-BE-NEXT:    strhmi r3, [r0, #12]
@@ -684,12 +972,11 @@ define arm_aapcs_vfpcc void @masked_v16i
 ; CHECK-LE-NEXT:    mov r4, sp
 ; CHECK-LE-NEXT:    bfc r4, #0, #4
 ; CHECK-LE-NEXT:    mov sp, r4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
 ; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    ldrh.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    uxth r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne.u8 r2, q0[0]
 ; CHECK-LE-NEXT:    strbne r2, [r0]
@@ -768,12 +1055,11 @@ define arm_aapcs_vfpcc void @masked_v16i
 ; CHECK-BE-NEXT:    bfc r4, #0, #4
 ; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    mov r1, sp
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrh.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    uxth r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne.u8 r2, q1[0]
 ; CHECK-BE-NEXT:    strbne r2, [r0]
@@ -860,73 +1146,72 @@ define i8* @masked_v16i8_pre(i8* %y, i8*
 ; CHECK-LE-NEXT:    vldr d1, [r7, #8]
 ; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
-; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
 ; CHECK-LE-NEXT:    sub.w r4, r7, #8
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp]
+; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    uxth r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-LE-NEXT:    strbne r1, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
+; CHECK-LE-NEXT:    vmovne.u8 r2, q0[0]
+; CHECK-LE-NEXT:    strbne r2, [r0]
+; CHECK-LE-NEXT:    lsls r2, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[1]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #1]
+; CHECK-LE-NEXT:    lsls r2, r1, #29
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[2]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #2]
+; CHECK-LE-NEXT:    lsls r2, r1, #28
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-LE-NEXT:    lsls r1, r2, #27
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[3]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #3]
+; CHECK-LE-NEXT:    lsls r2, r1, #27
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #26
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[4]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #4]
+; CHECK-LE-NEXT:    lsls r2, r1, #26
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-LE-NEXT:    lsls r1, r2, #25
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[5]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #5]
+; CHECK-LE-NEXT:    lsls r2, r1, #25
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-LE-NEXT:    lsls r1, r2, #24
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[6]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #6]
+; CHECK-LE-NEXT:    lsls r2, r1, #24
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-LE-NEXT:    lsls r1, r2, #23
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[7]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #7]
+; CHECK-LE-NEXT:    lsls r2, r1, #23
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #22
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[8]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #8]
+; CHECK-LE-NEXT:    lsls r2, r1, #22
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-LE-NEXT:    lsls r1, r2, #21
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[9]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #9]
+; CHECK-LE-NEXT:    lsls r2, r1, #21
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-LE-NEXT:    lsls r1, r2, #20
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[10]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #10]
+; CHECK-LE-NEXT:    lsls r2, r1, #20
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-LE-NEXT:    lsls r1, r2, #19
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[11]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #11]
+; CHECK-LE-NEXT:    lsls r2, r1, #19
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-LE-NEXT:    lsls r1, r2, #18
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[12]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #12]
+; CHECK-LE-NEXT:    lsls r2, r1, #18
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-LE-NEXT:    lsls r1, r2, #17
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[13]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #13]
+; CHECK-LE-NEXT:    lsls r2, r1, #17
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[14]
-; CHECK-LE-NEXT:    strbmi r1, [r0, #14]
-; CHECK-LE-NEXT:    lsls r1, r2, #16
+; CHECK-LE-NEXT:    vmovmi.u8 r2, q0[14]
+; CHECK-LE-NEXT:    strbmi r2, [r0, #14]
+; CHECK-LE-NEXT:    lsls r1, r1, #16
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[15]
 ; CHECK-LE-NEXT:    strbmi r1, [r0, #15]
@@ -947,74 +1232,73 @@ define i8* @masked_v16i8_pre(i8* %y, i8*
 ; CHECK-BE-NEXT:    vldr d1, [r7, #8]
 ; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
-; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp]
+; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
+; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    uxth r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne.u8 r1, q0[0]
-; CHECK-BE-NEXT:    strbne r1, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
+; CHECK-BE-NEXT:    vmovne.u8 r2, q0[0]
+; CHECK-BE-NEXT:    strbne r2, [r0]
+; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[1]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #1]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[1]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #1]
+; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[2]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #2]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[2]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #2]
+; CHECK-BE-NEXT:    lsls r2, r1, #28
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[3]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #3]
-; CHECK-BE-NEXT:    lsls r1, r2, #27
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[3]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #3]
+; CHECK-BE-NEXT:    lsls r2, r1, #27
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[4]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #26
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[4]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #4]
+; CHECK-BE-NEXT:    lsls r2, r1, #26
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[5]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #5]
-; CHECK-BE-NEXT:    lsls r1, r2, #25
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[5]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #5]
+; CHECK-BE-NEXT:    lsls r2, r1, #25
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[6]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #6]
-; CHECK-BE-NEXT:    lsls r1, r2, #24
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[6]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #6]
+; CHECK-BE-NEXT:    lsls r2, r1, #24
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[7]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #7]
-; CHECK-BE-NEXT:    lsls r1, r2, #23
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[7]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #7]
+; CHECK-BE-NEXT:    lsls r2, r1, #23
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[8]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #22
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[8]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #8]
+; CHECK-BE-NEXT:    lsls r2, r1, #22
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[9]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #9]
-; CHECK-BE-NEXT:    lsls r1, r2, #21
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[9]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #9]
+; CHECK-BE-NEXT:    lsls r2, r1, #21
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[10]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #10]
-; CHECK-BE-NEXT:    lsls r1, r2, #20
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[10]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #10]
+; CHECK-BE-NEXT:    lsls r2, r1, #20
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[11]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #11]
-; CHECK-BE-NEXT:    lsls r1, r2, #19
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[11]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #11]
+; CHECK-BE-NEXT:    lsls r2, r1, #19
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[12]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #12]
-; CHECK-BE-NEXT:    lsls r1, r2, #18
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[12]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #12]
+; CHECK-BE-NEXT:    lsls r2, r1, #18
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[13]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-BE-NEXT:    lsls r1, r2, #17
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[13]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #13]
+; CHECK-BE-NEXT:    lsls r2, r1, #17
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[14]
-; CHECK-BE-NEXT:    strbmi r1, [r0, #14]
-; CHECK-BE-NEXT:    lsls r1, r2, #16
+; CHECK-BE-NEXT:    vmovmi.u8 r2, q0[14]
+; CHECK-BE-NEXT:    strbmi r2, [r0, #14]
+; CHECK-BE-NEXT:    lsls r1, r1, #16
 ; CHECK-BE-NEXT:    itt mi
 ; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[15]
 ; CHECK-BE-NEXT:    strbmi r1, [r0, #15]
@@ -1045,12 +1329,11 @@ define i8* @masked_v16i8_post(i8* %y, i8
 ; CHECK-LE-NEXT:    vldr d1, [r7, #8]
 ; CHECK-LE-NEXT:    sub.w r4, r7, #8
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
 ; CHECK-LE-NEXT:    vcmp.s8 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrh.w r2, [sp]
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    uxth r2, r1
+; CHECK-LE-NEXT:    lsls r1, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    vmovne.u8 r1, q0[0]
 ; CHECK-LE-NEXT:    strbne r1, [r0]
@@ -1106,8 +1389,8 @@ define i8* @masked_v16i8_post(i8* %y, i8
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    vmovmi.u8 r1, q0[13]
 ; CHECK-LE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-LE-NEXT:    adds r1, r0, #4
 ; CHECK-LE-NEXT:    lsls r3, r2, #17
+; CHECK-LE-NEXT:    add.w r1, r0, #4
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    vmovmi.u8 r3, q0[14]
 ; CHECK-LE-NEXT:    strbmi r3, [r0, #14]
@@ -1133,13 +1416,12 @@ define i8* @masked_v16i8_post(i8* %y, i8
 ; CHECK-BE-NEXT:    vldr d1, [r7, #8]
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
-; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrh.w r2, [sp]
 ; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
+; CHECK-BE-NEXT:    vcmp.s8 gt, q1, zr
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    uxth r2, r1
+; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    itt ne
 ; CHECK-BE-NEXT:    vmovne.u8 r1, q0[0]
 ; CHECK-BE-NEXT:    strbne r1, [r0]
@@ -1195,8 +1477,8 @@ define i8* @masked_v16i8_post(i8* %y, i8
 ; CHECK-BE-NEXT:    itt mi
 ; CHECK-BE-NEXT:    vmovmi.u8 r1, q0[13]
 ; CHECK-BE-NEXT:    strbmi r1, [r0, #13]
-; CHECK-BE-NEXT:    adds r1, r0, #4
 ; CHECK-BE-NEXT:    lsls r3, r2, #17
+; CHECK-BE-NEXT:    add.w r1, r0, #4
 ; CHECK-BE-NEXT:    itt mi
 ; CHECK-BE-NEXT:    vmovmi.u8 r3, q0[14]
 ; CHECK-BE-NEXT:    strbmi r3, [r0, #14]
@@ -1223,10 +1505,22 @@ define arm_aapcs_vfpcc void @masked_v4f3
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    vstrne s0, [r0]
@@ -1247,11 +1541,23 @@ define arm_aapcs_vfpcc void @masked_v4f3
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vstrne s4, [r0]
@@ -1277,10 +1583,22 @@ define arm_aapcs_vfpcc void @masked_v4f3
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #20
 ; CHECK-LE-NEXT:    sub sp, #20
-; CHECK-LE-NEXT:    add r1, sp, #16
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp, #16]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    ittt ne
 ; CHECK-LE-NEXT:    vstrne s0, [sp, #12]
@@ -1309,11 +1627,23 @@ define arm_aapcs_vfpcc void @masked_v4f3
 ; CHECK-BE-NEXT:    .pad #20
 ; CHECK-BE-NEXT:    sub sp, #20
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
-; CHECK-BE-NEXT:    add r1, sp, #16
+; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp, #16]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    and r1, r1, #15
 ; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    ittt ne
 ; CHECK-BE-NEXT:    vstrne s4, [sp, #12]
@@ -1350,21 +1680,33 @@ define i8* @masked_v4f32_pre(i8* %y, i8*
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
+; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r1, r2, #15
+; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    vstrne s0, [r0]
-; CHECK-LE-NEXT:    lsls r1, r2, #30
+; CHECK-LE-NEXT:    lsls r2, r1, #30
 ; CHECK-LE-NEXT:    it mi
 ; CHECK-LE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-LE-NEXT:    lsls r1, r2, #29
+; CHECK-LE-NEXT:    lsls r2, r1, #29
 ; CHECK-LE-NEXT:    it mi
 ; CHECK-LE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-LE-NEXT:    lsls r1, r2, #28
+; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    it mi
 ; CHECK-LE-NEXT:    vstrmi s3, [r0, #12]
 ; CHECK-LE-NEXT:    add sp, #8
@@ -1377,22 +1719,34 @@ define i8* @masked_v4f32_pre(i8* %y, i8*
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r1, r2, #15
+; CHECK-BE-NEXT:    lsls r2, r1, #31
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vstrne s0, [r0]
-; CHECK-BE-NEXT:    lsls r1, r2, #30
+; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    it mi
 ; CHECK-BE-NEXT:    vstrmi s1, [r0, #4]
-; CHECK-BE-NEXT:    lsls r1, r2, #29
+; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    it mi
 ; CHECK-BE-NEXT:    vstrmi s2, [r0, #8]
-; CHECK-BE-NEXT:    lsls r1, r2, #28
+; CHECK-BE-NEXT:    lsls r1, r1, #28
 ; CHECK-BE-NEXT:    it mi
 ; CHECK-BE-NEXT:    vstrmi s3, [r0, #12]
 ; CHECK-BE-NEXT:    add sp, #8
@@ -1414,11 +1768,23 @@ define i8* @masked_v4f32_post(i8* %y, i8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    add r2, sp, #4
+; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    and r2, r2, #15
 ; CHECK-LE-NEXT:    lsls r1, r2, #31
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    vstrne s0, [r0]
@@ -1442,12 +1808,24 @@ define i8* @masked_v4f32_post(i8* %y, i8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    add r2, sp, #4
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp, #4]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    and r2, r2, #15
 ; CHECK-BE-NEXT:    lsls r1, r2, #31
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vstrne s0, [r0]
@@ -1480,11 +1858,35 @@ define arm_aapcs_vfpcc void @masked_v8f1
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    mov r1, sp
 ; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    bne .LBB15_9
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
@@ -1552,12 +1954,36 @@ define arm_aapcs_vfpcc void @masked_v8f1
 ; CHECK-BE-NEXT:    .pad #8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    mov r1, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    bne .LBB15_9
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
@@ -1630,11 +2056,35 @@ define arm_aapcs_vfpcc void @masked_v8f1
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #40
 ; CHECK-LE-NEXT:    sub sp, #40
-; CHECK-LE-NEXT:    add r1, sp, #32
 ; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-LE-NEXT:    vstr p0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r3, r1, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    bne .LBB16_9
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
@@ -1718,12 +2168,36 @@ define arm_aapcs_vfpcc void @masked_v8f1
 ; CHECK-BE-NEXT:    .pad #40
 ; CHECK-BE-NEXT:    sub sp, #40
 ; CHECK-BE-NEXT:    vrev64.16 q2, q1
-; CHECK-BE-NEXT:    add r1, sp, #32
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vstr p0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp, #32]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    bne .LBB16_9
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
@@ -1815,12 +2289,36 @@ define i8* @masked_v8f16_pre(i8* %y, i8*
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    adds r0, #4
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
+; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r3, r12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-LE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    ldrb.w r1, [sp]
-; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-LE-NEXT:    uxtb r1, r2
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    bne .LBB17_9
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
@@ -1890,13 +2388,37 @@ define i8* @masked_v8f16_pre(i8* %y, i8*
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    adds r0, #4
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r3, r12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    ldrb.w r1, [sp]
-; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    bfi r2, r3, #7, #1
+; CHECK-BE-NEXT:    uxtb r1, r2
+; CHECK-BE-NEXT:    lsls r2, r2, #31
 ; CHECK-BE-NEXT:    bne .LBB17_9
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
@@ -1975,12 +2497,36 @@ define i8* @masked_v8f16_post(i8* %y, i8
 ; CHECK-LE-NEXT:    sub sp, #8
 ; CHECK-LE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
-; CHECK-LE-NEXT:    mov r2, sp
+; CHECK-LE-NEXT:    movs r3, #0
 ; CHECK-LE-NEXT:    vcmp.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r2]
-; CHECK-LE-NEXT:    ldrb.w r2, [sp]
+; CHECK-LE-NEXT:    vmrs r12, p0
+; CHECK-LE-NEXT:    and r2, r12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-LE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
 ; CHECK-LE-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-LE-NEXT:    lsls r1, r2, #31
+; CHECK-LE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-LE-NEXT:    uxtb r2, r3
+; CHECK-LE-NEXT:    lsls r1, r3, #31
 ; CHECK-LE-NEXT:    bne .LBB18_12
 ; CHECK-LE-NEXT:  @ %bb.1: @ %else
 ; CHECK-LE-NEXT:    lsls r1, r2, #30
@@ -2046,13 +2592,37 @@ define i8* @masked_v8f16_post(i8* %y, i8
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    vldr d1, [sp, #8]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
-; CHECK-BE-NEXT:    mov r2, sp
+; CHECK-BE-NEXT:    movs r3, #0
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r2]
-; CHECK-BE-NEXT:    ldrb.w r2, [sp]
+; CHECK-BE-NEXT:    vmrs r12, p0
+; CHECK-BE-NEXT:    and r2, r12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #6, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #3, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #10, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #5, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #12, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #6, #1
+; CHECK-BE-NEXT:    ubfx r2, r12, #14, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-BE-NEXT:    lsls r1, r2, #31
+; CHECK-BE-NEXT:    bfi r3, r2, #7, #1
+; CHECK-BE-NEXT:    uxtb r2, r3
+; CHECK-BE-NEXT:    lsls r1, r3, #31
 ; CHECK-BE-NEXT:    bne .LBB18_12
 ; CHECK-BE-NEXT:  @ %bb.1: @ %else
 ; CHECK-BE-NEXT:    lsls r1, r2, #30

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-pred-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-pred-bitcast.ll?rev=371419&r1=371418&r2=371419&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-pred-bitcast.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-pred-bitcast.ll Mon Sep  9 09:35:49 2019
@@ -1,19 +1,55 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
 define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
-; CHECK-LABEL: bitcast_to_v4i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    and r0, r0, #15
-; CHECK-NEXT:    strb.w r0, [sp]
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    vldr p0, [r0]
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: bitcast_to_v4i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #4
+; CHECK-LE-NEXT:    sub sp, #4
+; CHECK-LE-NEXT:    and r0, r0, #15
+; CHECK-LE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-LE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-LE-NEXT:    vmsr p0, r0
+; CHECK-LE-NEXT:    vpsel q1, q2, q1
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-LE-NEXT:    vmov.32 q2[0], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-LE-NEXT:    vmov.32 q2[1], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-LE-NEXT:    vmov.32 q2[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-LE-NEXT:    vmov.32 q2[3], r0
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: bitcast_to_v4i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #4
+; CHECK-BE-NEXT:    sub sp, #4
+; CHECK-BE-NEXT:    and r0, r0, #15
+; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    vmsr p0, r0
+; CHECK-BE-NEXT:    vpsel q1, q2, q1
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-BE-NEXT:    vmov.32 q2[0], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-BE-NEXT:    vmov.32 q2[1], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-BE-NEXT:    vmov.32 q2[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-BE-NEXT:    vmov.32 q2[3], r0
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-BE-NEXT:    vpsel q1, q1, q0
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = bitcast i4 %b to <4 x i1>
   %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> zeroinitializer
@@ -21,17 +57,70 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) {
-; CHECK-LABEL: bitcast_to_v8i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    strb.w r0, [sp]
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vldr p0, [r0]
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: bitcast_to_v8i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #8
+; CHECK-LE-NEXT:    sub sp, #8
+; CHECK-LE-NEXT:    uxtb r0, r0
+; CHECK-LE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-LE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-LE-NEXT:    vmsr p0, r0
+; CHECK-LE-NEXT:    vpsel q2, q2, q1
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-LE-NEXT:    vmov.16 q1[0], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-LE-NEXT:    vmov.16 q1[1], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-LE-NEXT:    vmov.16 q1[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-LE-NEXT:    vmov.16 q1[3], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-LE-NEXT:    vmov.16 q1[4], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-LE-NEXT:    vmov.16 q1[5], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-LE-NEXT:    vmov.16 q1[6], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-LE-NEXT:    vmov.16 q1[7], r0
+; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: bitcast_to_v8i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #8
+; CHECK-BE-NEXT:    sub sp, #8
+; CHECK-BE-NEXT:    uxtb r0, r0
+; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    vmsr p0, r0
+; CHECK-BE-NEXT:    vpsel q2, q2, q1
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-BE-NEXT:    vmov.16 q1[0], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-BE-NEXT:    vmov.16 q1[1], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-BE-NEXT:    vmov.16 q1[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-BE-NEXT:    vmov.16 q1[3], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-BE-NEXT:    vmov.16 q1[4], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-BE-NEXT:    vmov.16 q1[5], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-BE-NEXT:    vmov.16 q1[6], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-BE-NEXT:    vmov.16 q1[7], r0
+; CHECK-BE-NEXT:    vcmp.i16 ne, q1, zr
+; CHECK-BE-NEXT:    vrev64.16 q1, q0
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-BE-NEXT:    vrev32.16 q0, q0
+; CHECK-BE-NEXT:    vpsel q1, q1, q0
+; CHECK-BE-NEXT:    vrev64.16 q0, q1
+; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = bitcast i8 %b to <8 x i1>
   %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> zeroinitializer
@@ -39,25 +128,46 @@ entry:
 }
 
 define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) {
-; CHECK-LABEL: bitcast_to_v16i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r6, r7, lr}
-; CHECK-NEXT:    .setfp r7, sp, #8
-; CHECK-NEXT:    add r7, sp, #8
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    mov r4, sp
-; CHECK-NEXT:    bfc r4, #0, #4
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    strh.w r0, [sp]
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    sub.w r4, r7, #8
-; CHECK-NEXT:    vldr p0, [r0]
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-LABEL: bitcast_to_v16i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
+; CHECK-LE-NEXT:    .setfp r7, sp, #8
+; CHECK-LE-NEXT:    add r7, sp, #8
+; CHECK-LE-NEXT:    .pad #16
+; CHECK-LE-NEXT:    sub sp, #16
+; CHECK-LE-NEXT:    mov r4, sp
+; CHECK-LE-NEXT:    bfc r4, #0, #4
+; CHECK-LE-NEXT:    mov sp, r4
+; CHECK-LE-NEXT:    uxth r0, r0
+; CHECK-LE-NEXT:    sub.w r4, r7, #8
+; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vmsr p0, r0
+; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    mov sp, r4
+; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+;
+; CHECK-BE-LABEL: bitcast_to_v16i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
+; CHECK-BE-NEXT:    .setfp r7, sp, #8
+; CHECK-BE-NEXT:    add r7, sp, #8
+; CHECK-BE-NEXT:    .pad #16
+; CHECK-BE-NEXT:    sub sp, #16
+; CHECK-BE-NEXT:    mov r4, sp
+; CHECK-BE-NEXT:    bfc r4, #0, #4
+; CHECK-BE-NEXT:    mov sp, r4
+; CHECK-BE-NEXT:    vrev64.8 q1, q0
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-BE-NEXT:    uxth r0, r0
+; CHECK-BE-NEXT:    sub.w r4, r7, #8
+; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vmsr p0, r0
+; CHECK-BE-NEXT:    vpsel q1, q1, q0
+; CHECK-BE-NEXT:    vrev64.8 q0, q1
+; CHECK-BE-NEXT:    mov sp, r4
+; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
 entry:
   %c = bitcast i16 %b to <16 x i1>
   %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer
@@ -65,20 +175,36 @@ entry:
 }
 
 define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) {
-; CHECK-LABEL: bitcast_to_v2i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    and r0, r0, #3
-; CHECK-NEXT:    sbfx r1, r0, #0, #1
-; CHECK-NEXT:    sbfx r0, r0, #1, #1
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: bitcast_to_v2i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #4
+; CHECK-LE-NEXT:    sub sp, #4
+; CHECK-LE-NEXT:    and r0, r0, #3
+; CHECK-LE-NEXT:    sbfx r1, r0, #0, #1
+; CHECK-LE-NEXT:    sbfx r0, r0, #1, #1
+; CHECK-LE-NEXT:    vmov.32 q1[0], r1
+; CHECK-LE-NEXT:    vmov.32 q1[1], r1
+; CHECK-LE-NEXT:    vmov.32 q1[2], r0
+; CHECK-LE-NEXT:    vmov.32 q1[3], r0
+; CHECK-LE-NEXT:    vand q0, q0, q1
+; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: bitcast_to_v2i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #4
+; CHECK-BE-NEXT:    sub sp, #4
+; CHECK-BE-NEXT:    and r0, r0, #3
+; CHECK-BE-NEXT:    sbfx r1, r0, #0, #1
+; CHECK-BE-NEXT:    sbfx r0, r0, #1, #1
+; CHECK-BE-NEXT:    vmov.32 q1[0], r1
+; CHECK-BE-NEXT:    vmov.32 q1[1], r1
+; CHECK-BE-NEXT:    vmov.32 q1[2], r0
+; CHECK-BE-NEXT:    vmov.32 q1[3], r0
+; CHECK-BE-NEXT:    vrev64.32 q2, q1
+; CHECK-BE-NEXT:    vand q0, q0, q2
+; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = bitcast i2 %b to <2 x i1>
   %s = select <2 x i1> %c, <2 x i64> %a, <2 x i64> zeroinitializer
@@ -87,16 +213,52 @@ entry:
 
 
 define arm_aapcs_vfpcc i4 @bitcast_from_v4i1(<4 x i32> %a) {
-; CHECK-LABEL: bitcast_from_v4i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    vcmp.i32 eq, q0, zr
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vstr p0, [r0]
-; CHECK-NEXT:    ldrb.w r0, [sp]
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: bitcast_from_v4i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #4
+; CHECK-LE-NEXT:    sub sp, #4
+; CHECK-LE-NEXT:    vcmp.i32 eq, q0, zr
+; CHECK-LE-NEXT:    movs r0, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r2, r1, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #0, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #1, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #2, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r0, r1, #3, #1
+; CHECK-LE-NEXT:    and r0, r0, #15
+; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: bitcast_from_v4i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #4
+; CHECK-BE-NEXT:    sub sp, #4
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    movs r3, #0
+; CHECK-BE-NEXT:    vcmp.i32 eq, q1, zr
+; CHECK-BE-NEXT:    vmrs r0, p0
+; CHECK-BE-NEXT:    and r2, r0, #1
+; CHECK-BE-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
+; CHECK-BE-NEXT:    bfi r3, r1, #1, #1
+; CHECK-BE-NEXT:    ubfx r1, r0, #8, #1
+; CHECK-BE-NEXT:    ubfx r0, r0, #12, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r3, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r0, r0, #0
+; CHECK-BE-NEXT:    bfi r3, r0, #3, #1
+; CHECK-BE-NEXT:    and r0, r3, #15
+; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp eq <4 x i32> %a, zeroinitializer
   %b = bitcast <4 x i1> %c to i4
@@ -104,16 +266,76 @@ entry:
 }
 
 define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) {
-; CHECK-LABEL: bitcast_from_v8i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vcmp.i16 eq, q0, zr
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vstr p0, [r0]
-; CHECK-NEXT:    ldrb.w r0, [sp]
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: bitcast_from_v8i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #8
+; CHECK-LE-NEXT:    sub sp, #8
+; CHECK-LE-NEXT:    vcmp.i16 eq, q0, zr
+; CHECK-LE-NEXT:    movs r0, #0
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    and r2, r1, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #0, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #1, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #2, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #6, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #3, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #4, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #10, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #5, #1
+; CHECK-LE-NEXT:    ubfx r2, r1, #12, #1
+; CHECK-LE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r0, r2, #6, #1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    bfi r0, r1, #7, #1
+; CHECK-LE-NEXT:    uxtb r0, r0
+; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: bitcast_from_v8i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #8
+; CHECK-BE-NEXT:    sub sp, #8
+; CHECK-BE-NEXT:    vrev64.16 q1, q0
+; CHECK-BE-NEXT:    vcmp.i16 eq, q1, zr
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    ubfx r0, r1, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r0, #0
+; CHECK-BE-NEXT:    and r0, r1, #1
+; CHECK-BE-NEXT:    rsbs r3, r0, #0
+; CHECK-BE-NEXT:    movs r0, #0
+; CHECK-BE-NEXT:    bfi r0, r3, #0, #1
+; CHECK-BE-NEXT:    bfi r0, r2, #1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r0, r2, #2, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #6, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r0, r2, #3, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r0, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #10, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r0, r2, #5, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r0, r2, #6, #1
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r0, r1, #7, #1
+; CHECK-BE-NEXT:    uxtb r0, r0
+; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %a, zeroinitializer
   %b = bitcast <8 x i1> %c to i8
@@ -121,24 +343,42 @@ entry:
 }
 
 define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) {
-; CHECK-LABEL: bitcast_from_v16i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r6, r7, lr}
-; CHECK-NEXT:    .setfp r7, sp, #8
-; CHECK-NEXT:    add r7, sp, #8
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    mov r4, sp
-; CHECK-NEXT:    bfc r4, #0, #4
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    sub.w r4, r7, #8
-; CHECK-NEXT:    vcmp.i8 eq, q0, zr
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vstr p0, [r0]
-; CHECK-NEXT:    ldrh.w r0, [sp]
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-LE-LABEL: bitcast_from_v16i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-LE-NEXT:    push {r4, r6, r7, lr}
+; CHECK-LE-NEXT:    .setfp r7, sp, #8
+; CHECK-LE-NEXT:    add r7, sp, #8
+; CHECK-LE-NEXT:    .pad #16
+; CHECK-LE-NEXT:    sub sp, #16
+; CHECK-LE-NEXT:    mov r4, sp
+; CHECK-LE-NEXT:    bfc r4, #0, #4
+; CHECK-LE-NEXT:    mov sp, r4
+; CHECK-LE-NEXT:    vcmp.i8 eq, q0, zr
+; CHECK-LE-NEXT:    sub.w r4, r7, #8
+; CHECK-LE-NEXT:    vmrs r0, p0
+; CHECK-LE-NEXT:    uxth r0, r0
+; CHECK-LE-NEXT:    mov sp, r4
+; CHECK-LE-NEXT:    pop {r4, r6, r7, pc}
+;
+; CHECK-BE-LABEL: bitcast_from_v16i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-BE-NEXT:    push {r4, r6, r7, lr}
+; CHECK-BE-NEXT:    .setfp r7, sp, #8
+; CHECK-BE-NEXT:    add r7, sp, #8
+; CHECK-BE-NEXT:    .pad #16
+; CHECK-BE-NEXT:    sub sp, #16
+; CHECK-BE-NEXT:    mov r4, sp
+; CHECK-BE-NEXT:    bfc r4, #0, #4
+; CHECK-BE-NEXT:    mov sp, r4
+; CHECK-BE-NEXT:    vrev64.8 q1, q0
+; CHECK-BE-NEXT:    sub.w r4, r7, #8
+; CHECK-BE-NEXT:    vcmp.i8 eq, q1, zr
+; CHECK-BE-NEXT:    vmrs r0, p0
+; CHECK-BE-NEXT:    uxth r0, r0
+; CHECK-BE-NEXT:    mov sp, r4
+; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
 entry:
   %c = icmp eq <16 x i8> %a, zeroinitializer
   %b = bitcast <16 x i1> %c to i16
@@ -146,25 +386,46 @@ entry:
 }
 
 define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) {
-; CHECK-LABEL: bitcast_from_v2i1:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    ands r1, r1, #1
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    mvnne r1, #1
-; CHECK-NEXT:    bfi r1, r0, #0, #1
-; CHECK-NEXT:    and r0, r1, #3
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    bx lr
+; CHECK-LE-LABEL: bitcast_from_v2i1:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .pad #4
+; CHECK-LE-NEXT:    sub sp, #4
+; CHECK-LE-NEXT:    vmov r0, s1
+; CHECK-LE-NEXT:    vmov r1, s0
+; CHECK-LE-NEXT:    vmov r2, s2
+; CHECK-LE-NEXT:    orrs r0, r1
+; CHECK-LE-NEXT:    vmov r1, s3
+; CHECK-LE-NEXT:    cset r0, eq
+; CHECK-LE-NEXT:    orrs r1, r2
+; CHECK-LE-NEXT:    cset r1, eq
+; CHECK-LE-NEXT:    ands r1, r1, #1
+; CHECK-LE-NEXT:    it ne
+; CHECK-LE-NEXT:    mvnne r1, #1
+; CHECK-LE-NEXT:    bfi r1, r0, #0, #1
+; CHECK-LE-NEXT:    and r0, r1, #3
+; CHECK-LE-NEXT:    add sp, #4
+; CHECK-LE-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: bitcast_from_v2i1:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .pad #4
+; CHECK-BE-NEXT:    sub sp, #4
+; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vmov r0, s6
+; CHECK-BE-NEXT:    vmov r1, s7
+; CHECK-BE-NEXT:    vmov r2, s5
+; CHECK-BE-NEXT:    orrs r0, r1
+; CHECK-BE-NEXT:    vmov r1, s4
+; CHECK-BE-NEXT:    cset r0, eq
+; CHECK-BE-NEXT:    orrs r1, r2
+; CHECK-BE-NEXT:    cset r1, eq
+; CHECK-BE-NEXT:    ands r1, r1, #1
+; CHECK-BE-NEXT:    it ne
+; CHECK-BE-NEXT:    mvnne r1, #1
+; CHECK-BE-NEXT:    bfi r1, r0, #0, #1
+; CHECK-BE-NEXT:    and r0, r1, #3
+; CHECK-BE-NEXT:    add sp, #4
+; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %a, zeroinitializer
   %b = bitcast <2 x i1> %c to i2

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-pred-loadstore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-pred-loadstore.ll?rev=371419&r1=371418&r2=371419&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-pred-loadstore.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-pred-loadstore.ll Mon Sep  9 09:35:49 2019
@@ -5,15 +5,41 @@
 define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) {
 ; CHECK-LE-LABEL: load_v4i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vldr p0, [r0]
+; CHECK-LE-NEXT:    ldrb r0, [r0]
+; CHECK-LE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-LE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-LE-NEXT:    vmsr p0, r0
+; CHECK-LE-NEXT:    vpsel q1, q2, q1
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-LE-NEXT:    vmov.32 q2[0], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-LE-NEXT:    vmov.32 q2[1], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-LE-NEXT:    vmov.32 q2[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-LE-NEXT:    vmov.32 q2[3], r0
 ; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: load_v4i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vldr p0, [r0]
+; CHECK-BE-NEXT:    ldrb r0, [r0]
+; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    vmsr p0, r0
+; CHECK-BE-NEXT:    vpsel q1, q2, q1
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-BE-NEXT:    vmov.32 q2[0], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-BE-NEXT:    vmov.32 q2[1], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-BE-NEXT:    vmov.32 q2[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-BE-NEXT:    vmov.32 q2[3], r0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
@@ -27,16 +53,58 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @load_v8i1(<8 x i1> *%src, <8 x i16> %a) {
 ; CHECK-LE-LABEL: load_v8i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vldr p0, [r0]
+; CHECK-LE-NEXT:    ldrb r0, [r0]
+; CHECK-LE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-LE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-LE-NEXT:    vmsr p0, r0
+; CHECK-LE-NEXT:    vpsel q2, q2, q1
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-LE-NEXT:    vmov.16 q1[0], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-LE-NEXT:    vmov.16 q1[1], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-LE-NEXT:    vmov.16 q1[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-LE-NEXT:    vmov.16 q1[3], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-LE-NEXT:    vmov.16 q1[4], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-LE-NEXT:    vmov.16 q1[5], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-LE-NEXT:    vmov.16 q1[6], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-LE-NEXT:    vmov.16 q1[7], r0
+; CHECK-LE-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: load_v8i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    ldrb r0, [r0]
+; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    vmsr p0, r0
+; CHECK-BE-NEXT:    vpsel q2, q2, q1
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-BE-NEXT:    vmov.16 q1[0], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-BE-NEXT:    vmov.16 q1[1], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-BE-NEXT:    vmov.16 q1[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-BE-NEXT:    vmov.16 q1[3], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-BE-NEXT:    vmov.16 q1[4], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-BE-NEXT:    vmov.16 q1[5], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-BE-NEXT:    vmov.16 q1[6], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-BE-NEXT:    vmov.16 q1[7], r0
+; CHECK-BE-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vldr p0, [r0]
 ; CHECK-BE-NEXT:    vrev32.16 q0, q0
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
@@ -50,17 +118,19 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @load_v16i1(<16 x i1> *%src, <16 x i8> %a) {
 ; CHECK-LE-LABEL: load_v16i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vldr p0, [r0]
+; CHECK-LE-NEXT:    ldrh r0, [r0]
 ; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vmsr p0, r0
 ; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: load_v16i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    ldrh r0, [r0]
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
-; CHECK-BE-NEXT:    vldr p0, [r0]
 ; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
 ; CHECK-BE-NEXT:    bx lr
@@ -106,14 +176,44 @@ define arm_aapcs_vfpcc void @store_v4i1(
 ; CHECK-LE-LABEL: store_v4i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vcmp.i32 eq, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r0]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    and r1, r1, #15
+; CHECK-LE-NEXT:    strb r1, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: store_v4i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i32 eq, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r0]
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    and r3, r1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    rsb.w r12, r2, #0
+; CHECK-BE-NEXT:    movs r2, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-BE-NEXT:    bfi r2, r12, #1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    rsbs r1, r1, #0
+; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
+; CHECK-BE-NEXT:    bfi r2, r1, #3, #1
+; CHECK-BE-NEXT:    and r1, r2, #15
+; CHECK-BE-NEXT:    strb r1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp eq <4 x i32> %a, zeroinitializer
@@ -125,14 +225,66 @@ define arm_aapcs_vfpcc void @store_v8i1(
 ; CHECK-LE-LABEL: store_v8i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vcmp.i16 eq, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r0]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #6, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #3, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #4, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #10, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #5, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #14, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #6, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #7, #1
+; CHECK-LE-NEXT:    strb r1, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: store_v8i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i16 eq, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r0]
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    ubfx r1, r2, #2, #1
+; CHECK-BE-NEXT:    rsb.w r12, r1, #0
+; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r1, #0
+; CHECK-BE-NEXT:    movs r1, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    bfi r1, r12, #1, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #6, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #3, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #10, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #5, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r2, r2, #14, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #6, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #7, #1
+; CHECK-BE-NEXT:    strb r1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %a, zeroinitializer
@@ -144,14 +296,16 @@ define arm_aapcs_vfpcc void @store_v16i1
 ; CHECK-LE-LABEL: store_v16i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vcmp.i8 eq, q0, zr
-; CHECK-LE-NEXT:    vstr p0, [r0]
+; CHECK-LE-NEXT:    vmrs r1, p0
+; CHECK-LE-NEXT:    strh r1, [r0]
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: store_v16i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i8 eq, q1, zr
-; CHECK-BE-NEXT:    vstr p0, [r0]
+; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    strh r1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %a, zeroinitializer




More information about the llvm-commits mailing list