[llvm] 48cef1f - [ARM] Create VMOVRRD from adjacent vector extracts

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 20 07:16:02 PDT 2021


Author: David Green
Date: 2021-04-20T15:15:43+01:00
New Revision: 48cef1fa8ee6448e35ffc34259da500d3b81c6b6

URL: https://github.com/llvm/llvm-project/commit/48cef1fa8ee6448e35ffc34259da500d3b81c6b6
DIFF: https://github.com/llvm/llvm-project/commit/48cef1fa8ee6448e35ffc34259da500d3b81c6b6.diff

LOG: [ARM] Create VMOVRRD from adjacent vector extracts

This adds a combine for extract(x, n); extract(x, n+1) ->
VMOVRRD(extract x, n/2). This allows two vector lanes to be moved at the
same time in a single instruction, and thanks to the other VMOVRRD folds
we have added recently can help reduce the number of executed
instructions. Floating point types are very similar, but will include a
bitcast to an integer type.

This also adds a shouldRewriteCopySrc, to prevent copy propagation from
DPR to SPR, which can break because not all DPR registers can be
directly extracted from. Otherwise the machine verifier is unhappy.

Differential Revision: https://reviews.llvm.org/D100244

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
    llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/test/CodeGen/ARM/addsubo-legalization.ll
    llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
    llvm/test/CodeGen/ARM/big-endian-vector-callee.ll
    llvm/test/CodeGen/ARM/combine-vmovdrr.ll
    llvm/test/CodeGen/ARM/vselect_imax.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
    llvm/test/CodeGen/Thumb2/active_lane_mask.ll
    llvm/test/CodeGen/Thumb2/mve-abs.ll
    llvm/test/CodeGen/Thumb2/mve-ctlz.ll
    llvm/test/CodeGen/Thumb2/mve-ctpop.ll
    llvm/test/CodeGen/Thumb2/mve-cttz.ll
    llvm/test/CodeGen/Thumb2/mve-div-expand.ll
    llvm/test/CodeGen/Thumb2/mve-fmath.ll
    llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
    llvm/test/CodeGen/Thumb2/mve-masked-load.ll
    llvm/test/CodeGen/Thumb2/mve-masked-store.ll
    llvm/test/CodeGen/Thumb2/mve-minmax.ll
    llvm/test/CodeGen/Thumb2/mve-neg.ll
    llvm/test/CodeGen/Thumb2/mve-nofloat.ll
    llvm/test/CodeGen/Thumb2/mve-phireg.ll
    llvm/test/CodeGen/Thumb2/mve-pred-and.ll
    llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
    llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
    llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
    llvm/test/CodeGen/Thumb2/mve-pred-not.ll
    llvm/test/CodeGen/Thumb2/mve-pred-or.ll
    llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
    llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-sext.ll
    llvm/test/CodeGen/Thumb2/mve-shifts.ll
    llvm/test/CodeGen/Thumb2/mve-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
    llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
    llvm/test/CodeGen/Thumb2/mve-vabd.ll
    llvm/test/CodeGen/Thumb2/mve-vabdus.ll
    llvm/test/CodeGen/Thumb2/mve-vaddv.ll
    llvm/test/CodeGen/Thumb2/mve-vcmp.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
    llvm/test/CodeGen/Thumb2/mve-vcvt.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
    llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld3.ll
    llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll
    llvm/test/CodeGen/Thumb2/mve-vmovn.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
    llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
    llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
    llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll
    llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index b702b5319fafd..33d5aaf6b8c26 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -909,3 +909,17 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
   }
   return false;
 }
+
+bool ARMBaseRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                               unsigned DefSubReg,
+                                               const TargetRegisterClass *SrcRC,
+                                               unsigned SrcSubReg) const {
+  // We can't extract an SPR from an arbitrary DPR (as opposed to a DPR_VFP2).
+  if (DefRC == &ARM::SPRRegClass && DefSubReg == 0 &&
+      SrcRC == &ARM::DPRRegClass &&
+      (SrcSubReg == ARM::ssub_0 || SrcSubReg == ARM::ssub_1))
+    return false;
+
+  return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+                                                  SrcRC, SrcSubReg);
+}
\ No newline at end of file

diff  --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index 5afb6c6aa015d..d29535bb91e5b 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -209,6 +209,11 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
                       unsigned DstSubReg,
                       const TargetRegisterClass *NewRC,
                       LiveIntervals &LIS) const override;
+
+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const;
 };
 
 } // end namespace llvm

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a3245581e81b7..5084860a728a0 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14081,6 +14081,69 @@ static SDValue PerformInsertEltCombine(SDNode *N,
   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
 }
 
+// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
+// directly or bitcast to an integer if the original is a float vector.
+// extract(x, n); extract(x, n+1)  ->  VMOVRRD(extract v2f64 x, n/2)
+// bitcast(extract(x, n)); bitcast(extract(x, n+1))  ->  VMOVRRD(extract x, n/2)
+static SDValue
+PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32)
+    return SDValue();
+
+  SDValue Ext = SDValue(N, 0);
+  if (Ext.getOpcode() == ISD::BITCAST &&
+      Ext.getOperand(0).getValueType() == MVT::f32)
+    Ext = Ext.getOperand(0);
+  if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(Ext.getOperand(1)) ||
+      Ext.getConstantOperandVal(1) % 2 != 0)
+    return SDValue();
+  if (Ext->use_size() == 1 &&
+      (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
+       Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
+    return SDValue();
+
+  SDValue Op0 = Ext.getOperand(0);
+  EVT VecVT = Op0.getValueType();
+  unsigned Lane = Ext.getConstantOperandVal(1);
+  if (VecVT.getVectorNumElements() != 4)
+    return SDValue();
+
+  // Find another extract, of Lane + 1
+  auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
+    return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+           isa<ConstantSDNode>(V->getOperand(1)) &&
+           V->getConstantOperandVal(1) == Lane + 1;
+  });
+  if (OtherIt == Op0->uses().end())
+    return SDValue();
+
+  // For float extracts, we need to be converting to a i32 for both vector
+  // lanes.
+  SDValue OtherExt(*OtherIt, 0);
+  if (OtherExt.getValueType() != MVT::i32) {
+    if (OtherExt->use_size() != 1 ||
+        OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
+        OtherExt->use_begin()->getValueType(0) != MVT::i32)
+      return SDValue();
+    OtherExt = SDValue(*OtherExt->use_begin(), 0);
+  }
+
+  // Convert the type to a f64 and extract with a VMOVRRD.
+  SDValue F64 = DCI.DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+      DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
+      DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
+  SDValue VMOVRRD =
+      DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
+
+  DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
+  return VMOVRRD;
+}
+
 static SDValue PerformExtractEltCombine(SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *ST) {
@@ -14115,6 +14178,10 @@ static SDValue PerformExtractEltCombine(SDNode *N,
       return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
   }
 
+  // extract x, n; extract x, n+1  ->  VMOVRRD x
+  if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
+    return R;
+
   return SDValue();
 }
 
@@ -16535,8 +16602,10 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   return Res;
 }
 
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
-                                    const ARMSubtarget *ST) {
+static SDValue PerformBITCASTCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *ST) {
+  SelectionDAG &DAG = DCI.DAG;
   SDValue Src = N->getOperand(0);
   EVT DstVT = N->getValueType(0);
 
@@ -16562,6 +16631,10 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
       DAG.getDataLayout().isBigEndian())
     return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
 
+  // bitcast(extract(x, n)); bitcast(extract(x, n+1))  ->  VMOVRRD x
+  if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
+    return R;
+
   return SDValue();
 }
 
@@ -16633,7 +16706,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ARMISD::BUILD_VECTOR:
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ISD::BITCAST:
-    return PerformBITCASTCombine(N, DCI.DAG, Subtarget);
+    return PerformBITCASTCombine(N, DCI, Subtarget);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
   case ARMISD::VECTOR_REG_CAST:

diff  --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
index e3a48ed0c14f1..64a6faf85345f 100644
--- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll
+++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
@@ -12,26 +12,22 @@ define <2 x i1> @uaddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmov r3, r2, d18
 ; CHECK-NEXT:    vadd.i64 q8, q9, q8
-; CHECK-NEXT:    vmov.32 r3, d18[0]
-; CHECK-NEXT:    vmov.32 r2, d18[1]
-; CHECK-NEXT:    vmov.32 r12, d16[0]
-; CHECK-NEXT:    vmov.32 lr, d16[1]
-; CHECK-NEXT:    vmov.32 r4, d17[0]
-; CHECK-NEXT:    vmov.32 r5, d19[0]
-; CHECK-NEXT:    vmov.32 r6, d17[1]
-; CHECK-NEXT:    vmov.32 r7, d19[1]
-; CHECK-NEXT:    subs.w r3, r12, r3
-; CHECK-NEXT:    sbcs.w r2, lr, r2
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmov r6, r7, d19
+; CHECK-NEXT:    vmov lr, r12, d16
+; CHECK-NEXT:    vmov r4, r5, d17
+; CHECK-NEXT:    subs.w r3, lr, r3
+; CHECK-NEXT:    sbcs.w r2, r12, r2
 ; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    subs r3, r4, r5
-; CHECK-NEXT:    sbcs.w r3, r6, r7
+; CHECK-NEXT:    subs r3, r4, r6
+; CHECK-NEXT:    sbcs.w r3, r5, r7
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r1, #1
 ; CHECK-NEXT:    cmp r1, #0
@@ -57,24 +53,20 @@ define <2 x i1> @usubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
 ; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
 ; CHECK-NEXT:    vsub.i64 q8, q9, q8
-; CHECK-NEXT:    vmov.32 r12, d18[0]
-; CHECK-NEXT:    vmov.32 lr, d18[1]
-; CHECK-NEXT:    vmov.32 r3, d16[0]
-; CHECK-NEXT:    vmov.32 r2, d16[1]
-; CHECK-NEXT:    vmov.32 r4, d19[0]
-; CHECK-NEXT:    vmov.32 r5, d17[0]
-; CHECK-NEXT:    vmov.32 r6, d19[1]
-; CHECK-NEXT:    vmov.32 r7, d17[1]
-; CHECK-NEXT:    subs.w r3, r12, r3
-; CHECK-NEXT:    sbcs.w r2, lr, r2
+; CHECK-NEXT:    vmov lr, r12, d18
+; CHECK-NEXT:    vmov r4, r5, d19
+; CHECK-NEXT:    vmov r3, r2, d16
+; CHECK-NEXT:    vmov r6, r7, d17
+; CHECK-NEXT:    subs.w r3, lr, r3
+; CHECK-NEXT:    sbcs.w r2, r12, r2
 ; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    subs r3, r4, r5
-; CHECK-NEXT:    sbcs.w r3, r6, r7
+; CHECK-NEXT:    subs r3, r4, r6
+; CHECK-NEXT:    sbcs.w r3, r5, r7
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r1, #1
 ; CHECK-NEXT:    cmp r1, #0

diff  --git a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
index 693f33553591a..96b6f3237df10 100644
--- a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
+++ b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
@@ -492,24 +492,23 @@ entry:
 define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) {
 ; CHECK-LABEL: conv_v8f16_to_i128:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    adr r1, .LCPI18_0
 ; CHECK-NEXT:    vrev64.16 q9, q0
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
 ; CHECK-NEXT:    vrev64.16 q8, q8
 ; CHECK-NEXT:    vadd.f16 q8, q9, q8
 ; CHECK-NEXT:    vrev32.16 q8, q8
-; CHECK-NEXT:    vmov.32 r12, d17[1]
-; CHECK-NEXT:    vmov.32 r2, d17[0]
-; CHECK-NEXT:    vmov.32 r3, d16[1]
-; CHECK-NEXT:    vmov.32 r1, d16[0]
-; CHECK-NEXT:    subs r12, r12, #1
-; CHECK-NEXT:    str r12, [r0, #12]
-; CHECK-NEXT:    sbcs r2, r2, #0
-; CHECK-NEXT:    str r2, [r0, #8]
-; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    sbc r1, r1, #0
-; CHECK-NEXT:    stm r0, {r1, r3}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r12, r2, d17
+; CHECK-NEXT:    vmov r3, r1, d16
+; CHECK-NEXT:    subs lr, r2, #1
+; CHECK-NEXT:    sbcs r2, r12, #0
+; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    sbc r3, r3, #0
+; CHECK-NEXT:    str r3, [r0]
+; CHECK-NEXT:    stmib r0, {r1, r2, lr}
+; CHECK-NEXT:    pop {r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI18_0:

diff  --git a/llvm/test/CodeGen/ARM/big-endian-vector-callee.ll b/llvm/test/CodeGen/ARM/big-endian-vector-callee.ll
index 73632abcaa7cd..c0c9d71e197f9 100644
--- a/llvm/test/CodeGen/ARM/big-endian-vector-callee.ll
+++ b/llvm/test/CodeGen/ARM/big-endian-vector-callee.ll
@@ -1050,21 +1050,13 @@ define fp128 @test_f128_v2f64(<2 x double> %p) {
 ; SOFT-NEXT:    .pad #16
 ; SOFT-NEXT:    sub sp, sp, #16
 ; SOFT-NEXT:    vmov d16, r3, r2
-; SOFT-NEXT:    add r12, sp, #12
 ; SOFT-NEXT:    vmov d17, r1, r0
 ; SOFT-NEXT:    vadd.f64 d19, d16, d16
 ; SOFT-NEXT:    vadd.f64 d18, d17, d17
 ; SOFT-NEXT:    vrev64.32 q8, q9
-; SOFT-NEXT:    vmov.32 r0, d16[0]
-; SOFT-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #8
-; SOFT-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; SOFT-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #4
-; SOFT-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; SOFT-NEXT:    vmov.32 r1, d16[1]
-; SOFT-NEXT:    vmov.32 r2, d17[0]
-; SOFT-NEXT:    vmov.32 r3, d17[1]
+; SOFT-NEXT:    vmov r2, r3, d17
+; SOFT-NEXT:    vmov r0, r1, d16
+; SOFT-NEXT:    stm sp, {r0, r1, r2, r3}
 ; SOFT-NEXT:    bl __addtf3
 ; SOFT-NEXT:    add sp, sp, #16
 ; SOFT-NEXT:    pop {r11, pc}
@@ -1076,19 +1068,11 @@ define fp128 @test_f128_v2f64(<2 x double> %p) {
 ; HARD-NEXT:    .pad #16
 ; HARD-NEXT:    sub sp, sp, #16
 ; HARD-NEXT:    vadd.f64 d17, d1, d1
-; HARD-NEXT:    add r12, sp, #12
 ; HARD-NEXT:    vadd.f64 d16, d0, d0
 ; HARD-NEXT:    vrev64.32 q8, q8
-; HARD-NEXT:    vmov.32 r0, d16[0]
-; HARD-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #8
-; HARD-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; HARD-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #4
-; HARD-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; HARD-NEXT:    vmov.32 r1, d16[1]
-; HARD-NEXT:    vmov.32 r2, d17[0]
-; HARD-NEXT:    vmov.32 r3, d17[1]
+; HARD-NEXT:    vmov r2, r3, d17
+; HARD-NEXT:    vmov r0, r1, d16
+; HARD-NEXT:    stm sp, {r0, r1, r2, r3}
 ; HARD-NEXT:    bl __addtf3
 ; HARD-NEXT:    add sp, sp, #16
 ; HARD-NEXT:    pop {r11, pc}
@@ -1106,20 +1090,12 @@ define fp128 @test_f128_v2i64(<2 x i64> %p) {
 ; SOFT-NEXT:    .pad #16
 ; SOFT-NEXT:    sub sp, sp, #16
 ; SOFT-NEXT:    vmov d17, r3, r2
-; SOFT-NEXT:    add r12, sp, #12
 ; SOFT-NEXT:    vmov d16, r1, r0
 ; SOFT-NEXT:    vadd.i64 q8, q8, q8
 ; SOFT-NEXT:    vrev64.32 q8, q8
-; SOFT-NEXT:    vmov.32 r0, d16[0]
-; SOFT-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #8
-; SOFT-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; SOFT-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #4
-; SOFT-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; SOFT-NEXT:    vmov.32 r1, d16[1]
-; SOFT-NEXT:    vmov.32 r2, d17[0]
-; SOFT-NEXT:    vmov.32 r3, d17[1]
+; SOFT-NEXT:    vmov r2, r3, d17
+; SOFT-NEXT:    vmov r0, r1, d16
+; SOFT-NEXT:    stm sp, {r0, r1, r2, r3}
 ; SOFT-NEXT:    bl __addtf3
 ; SOFT-NEXT:    add sp, sp, #16
 ; SOFT-NEXT:    pop {r11, pc}
@@ -1131,18 +1107,10 @@ define fp128 @test_f128_v2i64(<2 x i64> %p) {
 ; HARD-NEXT:    .pad #16
 ; HARD-NEXT:    sub sp, sp, #16
 ; HARD-NEXT:    vadd.i64 q8, q0, q0
-; HARD-NEXT:    add r12, sp, #12
 ; HARD-NEXT:    vrev64.32 q8, q8
-; HARD-NEXT:    vmov.32 r0, d16[0]
-; HARD-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #8
-; HARD-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; HARD-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #4
-; HARD-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; HARD-NEXT:    vmov.32 r1, d16[1]
-; HARD-NEXT:    vmov.32 r2, d17[0]
-; HARD-NEXT:    vmov.32 r3, d17[1]
+; HARD-NEXT:    vmov r2, r3, d17
+; HARD-NEXT:    vmov r0, r1, d16
+; HARD-NEXT:    stm sp, {r0, r1, r2, r3}
 ; HARD-NEXT:    bl __addtf3
 ; HARD-NEXT:    add sp, sp, #16
 ; HARD-NEXT:    pop {r11, pc}
@@ -1160,20 +1128,12 @@ define fp128 @test_f128_v4f32(<4 x float> %p) {
 ; SOFT-NEXT:    .pad #16
 ; SOFT-NEXT:    sub sp, sp, #16
 ; SOFT-NEXT:    vmov d17, r3, r2
-; SOFT-NEXT:    add r12, sp, #12
 ; SOFT-NEXT:    vmov d16, r1, r0
 ; SOFT-NEXT:    vrev64.32 q8, q8
 ; SOFT-NEXT:    vadd.f32 q8, q8, q8
-; SOFT-NEXT:    vmov.32 r0, d16[0]
-; SOFT-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #8
-; SOFT-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; SOFT-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #4
-; SOFT-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; SOFT-NEXT:    vmov.32 r1, d16[1]
-; SOFT-NEXT:    vmov.32 r2, d17[0]
-; SOFT-NEXT:    vmov.32 r3, d17[1]
+; SOFT-NEXT:    vmov r2, r3, d17
+; SOFT-NEXT:    vmov r0, r1, d16
+; SOFT-NEXT:    stm sp, {r0, r1, r2, r3}
 ; SOFT-NEXT:    bl __addtf3
 ; SOFT-NEXT:    add sp, sp, #16
 ; SOFT-NEXT:    pop {r11, pc}
@@ -1185,18 +1145,10 @@ define fp128 @test_f128_v4f32(<4 x float> %p) {
 ; HARD-NEXT:    .pad #16
 ; HARD-NEXT:    sub sp, sp, #16
 ; HARD-NEXT:    vrev64.32 q8, q0
-; HARD-NEXT:    add r12, sp, #12
 ; HARD-NEXT:    vadd.f32 q8, q8, q8
-; HARD-NEXT:    vmov.32 r0, d16[0]
-; HARD-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #8
-; HARD-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; HARD-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #4
-; HARD-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; HARD-NEXT:    vmov.32 r1, d16[1]
-; HARD-NEXT:    vmov.32 r2, d17[0]
-; HARD-NEXT:    vmov.32 r3, d17[1]
+; HARD-NEXT:    vmov r2, r3, d17
+; HARD-NEXT:    vmov r0, r1, d16
+; HARD-NEXT:    stm sp, {r0, r1, r2, r3}
 ; HARD-NEXT:    bl __addtf3
 ; HARD-NEXT:    add sp, sp, #16
 ; HARD-NEXT:    pop {r11, pc}
@@ -1214,20 +1166,12 @@ define fp128 @test_f128_v4i32(<4 x i32> %p) {
 ; SOFT-NEXT:    .pad #16
 ; SOFT-NEXT:    sub sp, sp, #16
 ; SOFT-NEXT:    vmov d17, r3, r2
-; SOFT-NEXT:    add r12, sp, #12
 ; SOFT-NEXT:    vmov d16, r1, r0
 ; SOFT-NEXT:    vrev64.32 q8, q8
 ; SOFT-NEXT:    vadd.i32 q8, q8, q8
-; SOFT-NEXT:    vmov.32 r0, d16[0]
-; SOFT-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #8
-; SOFT-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; SOFT-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #4
-; SOFT-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; SOFT-NEXT:    vmov.32 r1, d16[1]
-; SOFT-NEXT:    vmov.32 r2, d17[0]
-; SOFT-NEXT:    vmov.32 r3, d17[1]
+; SOFT-NEXT:    vmov r2, r3, d17
+; SOFT-NEXT:    vmov r0, r1, d16
+; SOFT-NEXT:    stm sp, {r0, r1, r2, r3}
 ; SOFT-NEXT:    bl __addtf3
 ; SOFT-NEXT:    add sp, sp, #16
 ; SOFT-NEXT:    pop {r11, pc}
@@ -1239,18 +1183,10 @@ define fp128 @test_f128_v4i32(<4 x i32> %p) {
 ; HARD-NEXT:    .pad #16
 ; HARD-NEXT:    sub sp, sp, #16
 ; HARD-NEXT:    vrev64.32 q8, q0
-; HARD-NEXT:    add r12, sp, #12
 ; HARD-NEXT:    vadd.i32 q8, q8, q8
-; HARD-NEXT:    vmov.32 r0, d16[0]
-; HARD-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #8
-; HARD-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; HARD-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #4
-; HARD-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; HARD-NEXT:    vmov.32 r1, d16[1]
-; HARD-NEXT:    vmov.32 r2, d17[0]
-; HARD-NEXT:    vmov.32 r3, d17[1]
+; HARD-NEXT:    vmov r2, r3, d17
+; HARD-NEXT:    vmov r0, r1, d16
+; HARD-NEXT:    stm sp, {r0, r1, r2, r3}
 ; HARD-NEXT:    bl __addtf3
 ; HARD-NEXT:    add sp, sp, #16
 ; HARD-NEXT:    pop {r11, pc}
@@ -1268,21 +1204,13 @@ define fp128 @test_f128_v8i16(<8 x i16> %p) {
 ; SOFT-NEXT:    .pad #16
 ; SOFT-NEXT:    sub sp, sp, #16
 ; SOFT-NEXT:    vmov d17, r3, r2
-; SOFT-NEXT:    add r12, sp, #12
 ; SOFT-NEXT:    vmov d16, r1, r0
 ; SOFT-NEXT:    vrev64.16 q8, q8
 ; SOFT-NEXT:    vadd.i16 q8, q8, q8
 ; SOFT-NEXT:    vrev32.16 q8, q8
-; SOFT-NEXT:    vmov.32 r0, d16[0]
-; SOFT-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #8
-; SOFT-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; SOFT-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #4
-; SOFT-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; SOFT-NEXT:    vmov.32 r1, d16[1]
-; SOFT-NEXT:    vmov.32 r2, d17[0]
-; SOFT-NEXT:    vmov.32 r3, d17[1]
+; SOFT-NEXT:    vmov r2, r3, d17
+; SOFT-NEXT:    vmov r0, r1, d16
+; SOFT-NEXT:    stm sp, {r0, r1, r2, r3}
 ; SOFT-NEXT:    bl __addtf3
 ; SOFT-NEXT:    add sp, sp, #16
 ; SOFT-NEXT:    pop {r11, pc}
@@ -1294,19 +1222,11 @@ define fp128 @test_f128_v8i16(<8 x i16> %p) {
 ; HARD-NEXT:    .pad #16
 ; HARD-NEXT:    sub sp, sp, #16
 ; HARD-NEXT:    vrev64.16 q8, q0
-; HARD-NEXT:    add r12, sp, #12
 ; HARD-NEXT:    vadd.i16 q8, q8, q8
 ; HARD-NEXT:    vrev32.16 q8, q8
-; HARD-NEXT:    vmov.32 r0, d16[0]
-; HARD-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #8
-; HARD-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; HARD-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #4
-; HARD-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; HARD-NEXT:    vmov.32 r1, d16[1]
-; HARD-NEXT:    vmov.32 r2, d17[0]
-; HARD-NEXT:    vmov.32 r3, d17[1]
+; HARD-NEXT:    vmov r2, r3, d17
+; HARD-NEXT:    vmov r0, r1, d16
+; HARD-NEXT:    stm sp, {r0, r1, r2, r3}
 ; HARD-NEXT:    bl __addtf3
 ; HARD-NEXT:    add sp, sp, #16
 ; HARD-NEXT:    pop {r11, pc}
@@ -1324,21 +1244,13 @@ define fp128 @test_f128_v16i8(<16 x i8> %p) {
 ; SOFT-NEXT:    .pad #16
 ; SOFT-NEXT:    sub sp, sp, #16
 ; SOFT-NEXT:    vmov d17, r3, r2
-; SOFT-NEXT:    add r12, sp, #12
 ; SOFT-NEXT:    vmov d16, r1, r0
 ; SOFT-NEXT:    vrev64.8 q8, q8
 ; SOFT-NEXT:    vadd.i8 q8, q8, q8
 ; SOFT-NEXT:    vrev32.8 q8, q8
-; SOFT-NEXT:    vmov.32 r0, d16[0]
-; SOFT-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #8
-; SOFT-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; SOFT-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; SOFT-NEXT:    add r12, sp, #4
-; SOFT-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; SOFT-NEXT:    vmov.32 r1, d16[1]
-; SOFT-NEXT:    vmov.32 r2, d17[0]
-; SOFT-NEXT:    vmov.32 r3, d17[1]
+; SOFT-NEXT:    vmov r2, r3, d17
+; SOFT-NEXT:    vmov r0, r1, d16
+; SOFT-NEXT:    stm sp, {r0, r1, r2, r3}
 ; SOFT-NEXT:    bl __addtf3
 ; SOFT-NEXT:    add sp, sp, #16
 ; SOFT-NEXT:    pop {r11, pc}
@@ -1350,19 +1262,11 @@ define fp128 @test_f128_v16i8(<16 x i8> %p) {
 ; HARD-NEXT:    .pad #16
 ; HARD-NEXT:    sub sp, sp, #16
 ; HARD-NEXT:    vrev64.8 q8, q0
-; HARD-NEXT:    add r12, sp, #12
 ; HARD-NEXT:    vadd.i8 q8, q8, q8
 ; HARD-NEXT:    vrev32.8 q8, q8
-; HARD-NEXT:    vmov.32 r0, d16[0]
-; HARD-NEXT:    vst1.32 {d17[1]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #8
-; HARD-NEXT:    vst1.32 {d16[0]}, [sp:32]
-; HARD-NEXT:    vst1.32 {d17[0]}, [r12:32]
-; HARD-NEXT:    add r12, sp, #4
-; HARD-NEXT:    vst1.32 {d16[1]}, [r12:32]
-; HARD-NEXT:    vmov.32 r1, d16[1]
-; HARD-NEXT:    vmov.32 r2, d17[0]
-; HARD-NEXT:    vmov.32 r3, d17[1]
+; HARD-NEXT:    vmov r2, r3, d17
+; HARD-NEXT:    vmov r0, r1, d16
+; HARD-NEXT:    stm sp, {r0, r1, r2, r3}
 ; HARD-NEXT:    bl __addtf3
 ; HARD-NEXT:    add sp, sp, #16
 ; HARD-NEXT:    pop {r11, pc}

diff  --git a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
index 5a9097a6c39e6..b3012bf842550 100644
--- a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
+++ b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
@@ -75,10 +75,8 @@ define i64 @severalUses(<2 x i64>* %addr, <8 x i8>* %addr2) {
 ; CHECK-LABEL: severalUses:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vmov.32 r2, d16[1]
-; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    vmov r0, r2, d16
 ; CHECK-NEXT:    vldr d18, [r1]
-; CHECK-NEXT:    vmov d16, r0, r2
 ; CHECK-NEXT:    vtbl.8 d16, {d16, d17}, d18
 ; CHECK-NEXT:    vstr d16, [r1]
 ; CHECK-NEXT:    mov r1, r2

diff  --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll
index 0853ce93ba5a3..67ec68da53ae4 100644
--- a/llvm/test/CodeGen/ARM/vselect_imax.ll
+++ b/llvm/test/CodeGen/ARM/vselect_imax.ll
@@ -72,55 +72,47 @@ define void @func_blend18(%T0_18* %loadaddr, %T0_18* %loadaddr2,
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT:    vld1.64 {d22, d23}, [r0:128]!
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]!
+; CHECK-NEXT:    vld1.64 {d22, d23}, [r0:128]!
+; CHECK-NEXT:    vmov r4, r6, d16
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
-; CHECK-NEXT:    mov r1, #0
 ; CHECK-NEXT:    vld1.64 {d20, d21}, [r0:128]
-; CHECK-NEXT:    vmov.32 r12, d18[0]
-; CHECK-NEXT:    vmov.32 r2, d20[0]
-; CHECK-NEXT:    vmov.32 lr, d18[1]
-; CHECK-NEXT:    vmov.32 r0, d20[1]
-; CHECK-NEXT:    vmov.32 r7, d16[0]
-; CHECK-NEXT:    vmov.32 r5, d22[0]
-; CHECK-NEXT:    vmov.32 r4, d22[1]
-; CHECK-NEXT:    vmov.32 r6, d19[0]
-; CHECK-NEXT:    subs r2, r2, r12
-; CHECK-NEXT:    vmov.32 r2, d16[1]
-; CHECK-NEXT:    sbcs r0, r0, lr
+; CHECK-NEXT:    vmov lr, r12, d18
 ; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmov r2, r1, d20
+; CHECK-NEXT:    subs r2, r2, lr
+; CHECK-NEXT:    vmov r7, lr, d17
+; CHECK-NEXT:    vmov r2, r5, d22
+; CHECK-NEXT:    sbcs r1, r1, r12
+; CHECK-NEXT:    mov r1, #0
+; CHECK-NEXT:    movlt r1, #1
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    mvnne r1, #0
+; CHECK-NEXT:    subs r2, r2, r4
+; CHECK-NEXT:    sbcs r6, r5, r6
+; CHECK-NEXT:    vmov r2, r12, d19
+; CHECK-NEXT:    vmov r5, r4, d21
+; CHECK-NEXT:    mov r6, #0
+; CHECK-NEXT:    movlt r6, #1
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    mvnne r6, #0
+; CHECK-NEXT:    subs r2, r5, r2
+; CHECK-NEXT:    sbcs r4, r4, r12
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    vmov r4, r5, d23
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    subs r7, r4, r7
+; CHECK-NEXT:    sbcs r7, r5, lr
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    mvnne r0, #0
-; CHECK-NEXT:    subs r7, r5, r7
-; CHECK-NEXT:    vmov.32 r7, d21[0]
-; CHECK-NEXT:    vmov.32 r5, d19[1]
-; CHECK-NEXT:    sbcs r2, r4, r2
-; CHECK-NEXT:    vmov.32 r4, d21[1]
-; CHECK-NEXT:    mov r2, #0
-; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    vdup.32 d25, r0
 ; CHECK-NEXT:    mvnne r2, #0
-; CHECK-NEXT:    subs r7, r7, r6
-; CHECK-NEXT:    vmov.32 r6, d23[0]
-; CHECK-NEXT:    vmov.32 r7, d17[0]
-; CHECK-NEXT:    sbcs r5, r4, r5
-; CHECK-NEXT:    mov r4, #0
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    vmov.32 r5, d17[1]
-; CHECK-NEXT:    subs r7, r6, r7
-; CHECK-NEXT:    vmov.32 r7, d23[1]
-; CHECK-NEXT:    sbcs r7, r7, r5
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    mvnne r1, #0
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    vdup.32 d25, r1
-; CHECK-NEXT:    mvnne r4, #0
-; CHECK-NEXT:    vdup.32 d24, r2
-; CHECK-NEXT:    vdup.32 d27, r4
+; CHECK-NEXT:    vdup.32 d24, r6
+; CHECK-NEXT:    vdup.32 d27, r2
 ; CHECK-NEXT:    vbit q8, q11, q12
-; CHECK-NEXT:    vdup.32 d26, r0
+; CHECK-NEXT:    vdup.32 d26, r1
 ; CHECK-NEXT:    vbit q9, q10, q13
 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r3:128]!
 ; CHECK-NEXT:    vst1.64 {d18, d19}, [r3:128]
@@ -142,111 +134,98 @@ define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
                            %T1_19* %blend, %T0_19* %storeaddr) {
 ; CHECK-LABEL: func_blend19:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, r8, r9, r11, lr}
 ; CHECK-NEXT:    add r2, r1, #48
-; CHECK-NEXT:    add r5, r1, #32
+; CHECK-NEXT:    mov r8, #0
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r2:128]
 ; CHECK-NEXT:    add r2, r0, #48
-; CHECK-NEXT:    add r6, r0, #32
-; CHECK-NEXT:    mov r7, #0
+; CHECK-NEXT:    mov lr, #0
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r2:128]
-; CHECK-NEXT:    vmov.32 r12, d16[0]
-; CHECK-NEXT:    vmov.32 r2, d18[0]
-; CHECK-NEXT:    vmov.32 lr, d16[1]
-; CHECK-NEXT:    vmov.32 r4, d18[1]
-; CHECK-NEXT:    vld1.64 {d28, d29}, [r0:128]!
-; CHECK-NEXT:    vld1.64 {d26, d27}, [r5:128]
-; CHECK-NEXT:    vld1.64 {d30, d31}, [r6:128]
-; CHECK-NEXT:    vmov.32 r5, d17[0]
-; CHECK-NEXT:    vld1.64 {d22, d23}, [r0:128]
-; CHECK-NEXT:    vmov.32 r0, d17[1]
-; CHECK-NEXT:    vld1.64 {d24, d25}, [r1:128]!
-; CHECK-NEXT:    vld1.64 {d20, d21}, [r1:128]
-; CHECK-NEXT:    mov r1, #0
-; CHECK-NEXT:    subs r2, r2, r12
+; CHECK-NEXT:    vmov r2, r12, d16
+; CHECK-NEXT:    vmov r6, r7, d17
+; CHECK-NEXT:    vmov r4, r5, d18
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    sbcs r2, r5, r12
 ; CHECK-NEXT:    mov r12, #0
-; CHECK-NEXT:    vmov.32 r2, d19[0]
-; CHECK-NEXT:    sbcs r6, r4, lr
-; CHECK-NEXT:    vmov.32 r4, d24[0]
-; CHECK-NEXT:    vmov.32 r6, d19[1]
+; CHECK-NEXT:    vmov r2, r4, d19
 ; CHECK-NEXT:    movlt r12, #1
 ; CHECK-NEXT:    cmp r12, #0
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    mvnne r12, #0
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    vmov.32 r5, d28[0]
+; CHECK-NEXT:    vld1.64 {d24, d25}, [r5:128]!
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r5:128]
+; CHECK-NEXT:    subs r2, r2, r6
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    add r0, r0, #32
+; CHECK-NEXT:    vld1.64 {d26, d27}, [r2:128]!
+; CHECK-NEXT:    vld1.64 {d22, d23}, [r2:128]
+; CHECK-NEXT:    sbcs r2, r4, r7
+; CHECK-NEXT:    vmov r4, r5, d21
+; CHECK-NEXT:    movlt r8, #1
+; CHECK-NEXT:    vmov r6, r7, d23
+; CHECK-NEXT:    cmp r8, #0
+; CHECK-NEXT:    mvnne r8, #0
+; CHECK-NEXT:    vld1.64 {d28, d29}, [r0:128]
+; CHECK-NEXT:    add r0, r1, #32
+; CHECK-NEXT:    vld1.64 {d30, d31}, [r0:128]
+; CHECK-NEXT:    vmov r0, r1, d20
+; CHECK-NEXT:    vdup.32 d7, r8
+; CHECK-NEXT:    vdup.32 d6, r12
+; CHECK-NEXT:    subs r4, r6, r4
+; CHECK-NEXT:    sbcs r4, r7, r5
+; CHECK-NEXT:    vmov r5, r6, d24
+; CHECK-NEXT:    vmov r7, r2, d26
+; CHECK-NEXT:    mov r4, #0
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    mvnne r4, #0
+; CHECK-NEXT:    vdup.32 d5, r4
+; CHECK-NEXT:    subs r5, r7, r5
+; CHECK-NEXT:    sbcs r2, r2, r6
+; CHECK-NEXT:    vmov r7, r6, d27
+; CHECK-NEXT:    vmov r2, r9, d25
+; CHECK-NEXT:    mov r5, #0
+; CHECK-NEXT:    movlt r5, #1
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    mvnne r5, #0
+; CHECK-NEXT:    subs r2, r7, r2
+; CHECK-NEXT:    sbcs r2, r6, r9
+; CHECK-NEXT:    vmov r6, r7, d22
 ; CHECK-NEXT:    mov r2, #0
-; CHECK-NEXT:    sbcs r0, r6, r0
-; CHECK-NEXT:    vmov.32 r6, d28[1]
-; CHECK-NEXT:    vmov.32 r0, d24[1]
 ; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    mvnne r2, #0
-; CHECK-NEXT:    vdup.32 d7, r2
-; CHECK-NEXT:    vdup.32 d6, r12
-; CHECK-NEXT:    subs r5, r5, r4
-; CHECK-NEXT:    vmov.32 r4, d25[1]
-; CHECK-NEXT:    vmov.32 r5, d25[0]
-; CHECK-NEXT:    sbcs r0, r6, r0
+; CHECK-NEXT:    vdup.32 d1, r2
+; CHECK-NEXT:    vdup.32 d0, r5
+; CHECK-NEXT:    vbit q12, q13, q0
+; CHECK-NEXT:    subs r0, r6, r0
+; CHECK-NEXT:    vmov r2, r6, d28
+; CHECK-NEXT:    sbcs r0, r7, r1
+; CHECK-NEXT:    mov r7, #0
+; CHECK-NEXT:    vmov r0, r1, d30
+; CHECK-NEXT:    movlt r7, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    vmov r2, r5, d29
+; CHECK-NEXT:    sbcs r0, r6, r1
 ; CHECK-NEXT:    mov r6, #0
-; CHECK-NEXT:    vmov.32 r0, d29[0]
+; CHECK-NEXT:    vmov r0, r1, d31
 ; CHECK-NEXT:    movlt r6, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs r0, r5, r1
+; CHECK-NEXT:    movlt lr, #1
+; CHECK-NEXT:    cmp lr, #0
+; CHECK-NEXT:    mvnne lr, #0
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    mvnne r6, #0
-; CHECK-NEXT:    subs r0, r0, r5
-; CHECK-NEXT:    vmov.32 r5, d21[0]
-; CHECK-NEXT:    vmov.32 r0, d29[1]
-; CHECK-NEXT:    sbcs r0, r0, r4
-; CHECK-NEXT:    vmov.32 r4, d23[0]
-; CHECK-NEXT:    mov r0, #0
-; CHECK-NEXT:    movlt r0, #1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mvnne r0, #0
-; CHECK-NEXT:    vdup.32 d1, r0
-; CHECK-NEXT:    mov r0, #0
-; CHECK-NEXT:    vdup.32 d0, r6
-; CHECK-NEXT:    vmov.32 r6, d22[0]
-; CHECK-NEXT:    vbit q12, q14, q0
-; CHECK-NEXT:    subs r5, r4, r5
-; CHECK-NEXT:    vmov.32 r4, d23[1]
-; CHECK-NEXT:    vmov.32 r5, d21[1]
-; CHECK-NEXT:    sbcs r5, r4, r5
-; CHECK-NEXT:    vmov.32 r4, d20[1]
-; CHECK-NEXT:    vmov.32 r5, d20[0]
-; CHECK-NEXT:    movlt r0, #1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mvnne r0, #0
-; CHECK-NEXT:    vdup.32 d5, r0
-; CHECK-NEXT:    add r0, r3, #32
-; CHECK-NEXT:    subs r6, r6, r5
-; CHECK-NEXT:    vmov.32 r5, d26[0]
-; CHECK-NEXT:    vmov.32 r6, d22[1]
-; CHECK-NEXT:    sbcs r6, r6, r4
-; CHECK-NEXT:    mov r4, #0
-; CHECK-NEXT:    vmov.32 r6, d30[0]
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    subs r6, r6, r5
-; CHECK-NEXT:    vmov.32 r5, d30[1]
-; CHECK-NEXT:    vmov.32 r6, d26[1]
-; CHECK-NEXT:    sbcs r6, r5, r6
-; CHECK-NEXT:    vmov.32 r5, d31[0]
-; CHECK-NEXT:    vmov.32 r6, d27[0]
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    subs r6, r5, r6
-; CHECK-NEXT:    vmov.32 r5, d31[1]
-; CHECK-NEXT:    vmov.32 r6, d27[1]
-; CHECK-NEXT:    sbcs r6, r5, r6
-; CHECK-NEXT:    movlt r7, #1
+; CHECK-NEXT:    vdup.32 d3, lr
+; CHECK-NEXT:    vdup.32 d2, r6
 ; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    vorr q13, q1, q1
 ; CHECK-NEXT:    mvnne r7, #0
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    mvnne r1, #0
-; CHECK-NEXT:    vdup.32 d3, r7
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    vdup.32 d2, r1
-; CHECK-NEXT:    mvnne r4, #0
-; CHECK-NEXT:    vbit q13, q15, q1
-; CHECK-NEXT:    vdup.32 d4, r4
+; CHECK-NEXT:    vdup.32 d4, r7
+; CHECK-NEXT:    add r0, r3, #32
+; CHECK-NEXT:    vbsl q13, q14, q15
 ; CHECK-NEXT:    vbit q10, q11, q2
 ; CHECK-NEXT:    vbit q8, q9, q3
 ; CHECK-NEXT:    vst1.64 {d26, d27}, [r0:128]
@@ -254,7 +233,7 @@ define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
 ; CHECK-NEXT:    vst1.64 {d24, d25}, [r3:128]!
 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
 ; CHECK-NEXT:    vst1.64 {d20, d21}, [r3:128]
-; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, r9, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
   %v0 = load %T0_19, %T0_19* %loadaddr
   %v1 = load %T0_19, %T0_19* %loadaddr2
@@ -280,202 +259,170 @@ define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, sp, #8
-; CHECK-NEXT:    mov r8, r1
-; CHECK-NEXT:    mov r9, r0
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r8:128]!
-; CHECK-NEXT:    add r10, r0, #64
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r9:128]!
-; CHECK-NEXT:    vmov.32 r2, d16[0]
+; CHECK-NEXT:    add r9, r1, #64
+; CHECK-NEXT:    mov r2, #32
+; CHECK-NEXT:    add r8, r0, #64
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r9:128], r2
+; CHECK-NEXT:    mov r10, r1
+; CHECK-NEXT:    mov r11, r0
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r8:128], r2
+; CHECK-NEXT:    vmov r7, r5, d17
+; CHECK-NEXT:    vmov r6, r2, d19
 ; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    vmov.32 r6, d18[0]
-; CHECK-NEXT:    vmov.32 r4, d16[1]
-; CHECK-NEXT:    vmov.32 r7, d18[1]
-; CHECK-NEXT:    vmov.32 r5, d17[0]
-; CHECK-NEXT:    subs r2, r6, r2
-; CHECK-NEXT:    mov r6, #0
-; CHECK-NEXT:    vmov.32 r2, d19[0]
-; CHECK-NEXT:    sbcs r7, r7, r4
-; CHECK-NEXT:    movlt r6, #1
-; CHECK-NEXT:    vmov.32 r7, d17[1]
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    vmov.32 r2, d19[1]
-; CHECK-NEXT:    sbcs r2, r2, r7
+; CHECK-NEXT:    vld1.64 {d22, d23}, [r10:128]!
+; CHECK-NEXT:    subs r7, r6, r7
+; CHECK-NEXT:    sbcs r2, r2, r5
+; CHECK-NEXT:    vmov r5, r6, d16
+; CHECK-NEXT:    vmov r7, r4, d18
 ; CHECK-NEXT:    mov r2, #0
 ; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    mvnne r2, #0
-; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    vdup.32 d21, r2
-; CHECK-NEXT:    mvnne r6, #0
-; CHECK-NEXT:    vdup.32 d20, r6
-; CHECK-NEXT:    mov r2, #32
-; CHECK-NEXT:    add r6, r1, #64
-; CHECK-NEXT:    vld1.64 {d24, d25}, [r10:128], r2
-; CHECK-NEXT:    vbit q8, q9, q10
-; CHECK-NEXT:    vld1.64 {d28, d29}, [r6:128], r2
-; CHECK-NEXT:    vmov.32 r4, d29[0]
-; CHECK-NEXT:    vmov.32 r5, d25[0]
-; CHECK-NEXT:    vld1.64 {d0, d1}, [r9:128]
-; CHECK-NEXT:    vld1.64 {d2, d3}, [r8:128]
-; CHECK-NEXT:    vld1.64 {d22, d23}, [r6:128]!
-; CHECK-NEXT:    vld1.64 {d20, d21}, [r6:128]
-; CHECK-NEXT:    vmov.32 r6, d0[0]
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r10:128]!
-; CHECK-NEXT:    vmov.32 r9, d23[0]
-; CHECK-NEXT:    vmov.32 r11, d19[0]
-; CHECK-NEXT:    vmov.32 r8, d23[1]
-; CHECK-NEXT:    subs r4, r5, r4
-; CHECK-NEXT:    vmov.32 r5, d25[1]
-; CHECK-NEXT:    vmov.32 r4, d29[1]
-; CHECK-NEXT:    sbcs r4, r5, r4
-; CHECK-NEXT:    vmov.32 r5, d24[0]
-; CHECK-NEXT:    mov r4, #0
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    mvnne r4, #0
-; CHECK-NEXT:    vdup.32 d5, r4
-; CHECK-NEXT:    vmov.32 r4, d28[0]
-; CHECK-NEXT:    subs r4, r5, r4
-; CHECK-NEXT:    vmov.32 r5, d24[1]
-; CHECK-NEXT:    vmov.32 r4, d28[1]
-; CHECK-NEXT:    sbcs r4, r5, r4
-; CHECK-NEXT:    vmov.32 r5, d1[0]
-; CHECK-NEXT:    mov r4, #0
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    mvnne r4, #0
-; CHECK-NEXT:    vdup.32 d4, r4
-; CHECK-NEXT:    vmov.32 r4, d3[0]
-; CHECK-NEXT:    subs r4, r5, r4
-; CHECK-NEXT:    vmov.32 r5, d1[1]
-; CHECK-NEXT:    vmov.32 r4, d3[1]
-; CHECK-NEXT:    sbcs r4, r5, r4
-; CHECK-NEXT:    add r5, r1, #32
-; CHECK-NEXT:    vld1.64 {d26, d27}, [r5:128]
-; CHECK-NEXT:    add r5, r1, #48
+; CHECK-NEXT:    subs r5, r7, r5
+; CHECK-NEXT:    sbcs r4, r4, r6
 ; CHECK-NEXT:    mov r4, #0
-; CHECK-NEXT:    add r1, r1, #80
-; CHECK-NEXT:    vld1.64 {d30, d31}, [r5:128]
 ; CHECK-NEXT:    movlt r4, #1
 ; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    vbif q12, q14, q2
-; CHECK-NEXT:    vmov.32 r5, d2[0]
 ; CHECK-NEXT:    mvnne r4, #0
-; CHECK-NEXT:    vdup.32 d29, r4
-; CHECK-NEXT:    vmov.32 r4, d31[1]
-; CHECK-NEXT:    subs r5, r6, r5
-; CHECK-NEXT:    vmov.32 r6, d0[1]
-; CHECK-NEXT:    vmov.32 r5, d2[1]
-; CHECK-NEXT:    sbcs r5, r6, r5
-; CHECK-NEXT:    add r6, r0, #48
-; CHECK-NEXT:    mov r5, #0
-; CHECK-NEXT:    vld1.64 {d6, d7}, [r6:128]
-; CHECK-NEXT:    movlt r5, #1
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    mvnne r5, #0
-; CHECK-NEXT:    vmov.32 r7, d7[0]
-; CHECK-NEXT:    vdup.32 d28, r5
-; CHECK-NEXT:    vmov.32 r5, d31[0]
-; CHECK-NEXT:    vbsl q14, q0, q1
-; CHECK-NEXT:    vmov.32 r6, d7[1]
-; CHECK-NEXT:    vmov.32 r2, d6[0]
-; CHECK-NEXT:    subs r5, r7, r5
-; CHECK-NEXT:    vmov.32 r7, d6[1]
-; CHECK-NEXT:    sbcs r4, r6, r4
-; CHECK-NEXT:    vmov.32 r6, d30[0]
-; CHECK-NEXT:    vmov.32 r5, d30[1]
+; CHECK-NEXT:    vdup.32 d20, r4
+; CHECK-NEXT:    vmov r2, r4, d23
+; CHECK-NEXT:    vbit q8, q9, q10
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r11:128]!
+; CHECK-NEXT:    vmov r7, r5, d19
+; CHECK-NEXT:    subs r2, r7, r2
+; CHECK-NEXT:    sbcs r2, r5, r4
+; CHECK-NEXT:    vmov r5, r7, d18
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    mvnne r2, #0
+; CHECK-NEXT:    vdup.32 d21, r2
+; CHECK-NEXT:    vmov r2, r4, d22
+; CHECK-NEXT:    subs r2, r5, r2
+; CHECK-NEXT:    sbcs r2, r7, r4
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    mvnne r2, #0
+; CHECK-NEXT:    vdup.32 d20, r2
+; CHECK-NEXT:    add r2, r0, #48
+; CHECK-NEXT:    vbif q9, q11, q10
+; CHECK-NEXT:    vld1.64 {d30, d31}, [r2:128]
+; CHECK-NEXT:    add r2, r1, #48
+; CHECK-NEXT:    vld1.64 {d2, d3}, [r2:128]
+; CHECK-NEXT:    vmov r5, r7, d30
+; CHECK-NEXT:    vmov r2, r4, d2
+; CHECK-NEXT:    vld1.64 {d26, d27}, [r11:128]
+; CHECK-NEXT:    vld1.64 {d0, d1}, [r10:128]
+; CHECK-NEXT:    vld1.64 {d24, d25}, [r9:128]!
+; CHECK-NEXT:    vld1.64 {d22, d23}, [r9:128]
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r8:128]!
+; CHECK-NEXT:    vmov r11, r10, d21
+; CHECK-NEXT:    subs r2, r5, r2
+; CHECK-NEXT:    sbcs r2, r7, r4
+; CHECK-NEXT:    vmov r7, r6, d31
+; CHECK-NEXT:    vmov r2, r5, d3
 ; CHECK-NEXT:    mov r4, #0
 ; CHECK-NEXT:    movlt r4, #1
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    mvnne r4, #0
-; CHECK-NEXT:    vdup.32 d3, r4
-; CHECK-NEXT:    vmov.32 r4, d26[1]
-; CHECK-NEXT:    subs r2, r2, r6
-; CHECK-NEXT:    sbcs r2, r7, r5
-; CHECK-NEXT:    add r5, r0, #32
+; CHECK-NEXT:    subs r2, r7, r2
+; CHECK-NEXT:    mov r7, #0
+; CHECK-NEXT:    sbcs r2, r6, r5
+; CHECK-NEXT:    vmov r6, r5, d27
+; CHECK-NEXT:    vmov r2, r9, d1
+; CHECK-NEXT:    movlt r7, #1
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    mvnne r7, #0
+; CHECK-NEXT:    vdup.32 d7, r7
+; CHECK-NEXT:    vdup.32 d6, r4
+; CHECK-NEXT:    subs r2, r6, r2
+; CHECK-NEXT:    sbcs r2, r5, r9
+; CHECK-NEXT:    vmov r6, r5, d26
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    mvnne r2, #0
+; CHECK-NEXT:    vdup.32 d5, r2
+; CHECK-NEXT:    vmov r2, r9, d0
+; CHECK-NEXT:    subs r2, r6, r2
+; CHECK-NEXT:    sbcs r2, r5, r9
 ; CHECK-NEXT:    mov r2, #0
-; CHECK-NEXT:    vld1.64 {d0, d1}, [r5:128]
 ; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    mvnne r2, #0
-; CHECK-NEXT:    vmov.32 r6, d0[0]
-; CHECK-NEXT:    vdup.32 d2, r2
+; CHECK-NEXT:    vdup.32 d4, r2
+; CHECK-NEXT:    add r2, r1, #32
+; CHECK-NEXT:    vld1.64 {d28, d29}, [r2:128]
+; CHECK-NEXT:    add r2, r0, #32
+; CHECK-NEXT:    vbif q13, q0, q2
+; CHECK-NEXT:    add r1, r1, #80
+; CHECK-NEXT:    vld1.64 {d0, d1}, [r2:128]
+; CHECK-NEXT:    vmov r4, r5, d28
+; CHECK-NEXT:    vbif q15, q1, q3
 ; CHECK-NEXT:    add r0, r0, #80
-; CHECK-NEXT:    vmov.32 r2, d26[0]
-; CHECK-NEXT:    vbit q15, q3, q1
-; CHECK-NEXT:    vmov.32 r5, d0[1]
-; CHECK-NEXT:    vmov.32 r7, d1[0]
-; CHECK-NEXT:    vld1.64 {d2, d3}, [r10:128]
-; CHECK-NEXT:    vld1.64 {d6, d7}, [r1:128]
+; CHECK-NEXT:    vmov r2, r6, d0
+; CHECK-NEXT:    vld1.64 {d2, d3}, [r8:128]
+; CHECK-NEXT:    vmov r9, r8, d25
 ; CHECK-NEXT:    vld1.64 {d8, d9}, [r0:128]
-; CHECK-NEXT:    vmov.32 r1, d7[1]
-; CHECK-NEXT:    vmov.32 r10, d19[1]
-; CHECK-NEXT:    vmov.32 lr, d6[0]
-; CHECK-NEXT:    vmov.32 r3, d8[0]
-; CHECK-NEXT:    vmov.32 r12, d8[1]
-; CHECK-NEXT:    subs r2, r6, r2
-; CHECK-NEXT:    vmov.32 r6, d1[1]
-; CHECK-NEXT:    sbcs r2, r5, r4
-; CHECK-NEXT:    vmov.32 r5, d27[0]
-; CHECK-NEXT:    vmov.32 r4, d27[1]
+; CHECK-NEXT:    vld1.64 {d6, d7}, [r1:128]
+; CHECK-NEXT:    vmov r3, r12, d8
+; CHECK-NEXT:    subs r2, r2, r4
+; CHECK-NEXT:    sbcs r2, r6, r5
+; CHECK-NEXT:    vmov r4, r5, d29
+; CHECK-NEXT:    vmov r6, r7, d1
 ; CHECK-NEXT:    mov r2, #0
 ; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    mvnne r2, #0
-; CHECK-NEXT:    subs r5, r7, r5
-; CHECK-NEXT:    vmov.32 r7, d7[0]
-; CHECK-NEXT:    sbcs r4, r6, r4
-; CHECK-NEXT:    vmov.32 r6, d2[0]
+; CHECK-NEXT:    subs r4, r6, r4
+; CHECK-NEXT:    sbcs r4, r7, r5
+; CHECK-NEXT:    vmov r5, r6, d2
 ; CHECK-NEXT:    mov r4, #0
-; CHECK-NEXT:    vmov.32 r5, d2[1]
 ; CHECK-NEXT:    movlt r4, #1
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    mvnne r4, #0
 ; CHECK-NEXT:    vdup.32 d5, r4
 ; CHECK-NEXT:    vdup.32 d4, r2
-; CHECK-NEXT:    vmov.32 r2, d20[0]
-; CHECK-NEXT:    vbit q13, q0, q2
-; CHECK-NEXT:    vmov.32 r4, d20[1]
-; CHECK-NEXT:    subs r0, r6, r2
-; CHECK-NEXT:    vmov.32 r2, d9[1]
-; CHECK-NEXT:    sbcs r0, r5, r4
-; CHECK-NEXT:    vmov.32 r4, d9[0]
-; CHECK-NEXT:    mov r0, #0
-; CHECK-NEXT:    vmov.32 r6, d18[0]
-; CHECK-NEXT:    movlt r0, #1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    mvnne r0, #0
-; CHECK-NEXT:    vmov.32 r5, d18[1]
-; CHECK-NEXT:    subs r4, r4, r7
-; CHECK-NEXT:    vmov.32 r7, d21[1]
-; CHECK-NEXT:    sbcs r1, r2, r1
-; CHECK-NEXT:    vmov.32 r4, d22[1]
-; CHECK-NEXT:    vmov.32 r1, d22[0]
+; CHECK-NEXT:    vmov r2, r4, d22
+; CHECK-NEXT:    vbit q14, q0, q2
+; CHECK-NEXT:    subs r2, r5, r2
+; CHECK-NEXT:    sbcs r2, r6, r4
+; CHECK-NEXT:    vmov r4, r5, d24
+; CHECK-NEXT:    vmov r6, r7, d20
 ; CHECK-NEXT:    mov r2, #0
 ; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    mvnne r2, #0
-; CHECK-NEXT:    vdup.32 d11, r2
-; CHECK-NEXT:    vmov.32 r2, d3[1]
-; CHECK-NEXT:    subs r1, r6, r1
-; CHECK-NEXT:    vmov.32 r6, d21[0]
-; CHECK-NEXT:    sbcs r1, r5, r4
-; CHECK-NEXT:    vmov.32 r4, d3[0]
-; CHECK-NEXT:    vmov.32 r5, d6[1]
+; CHECK-NEXT:    subs r1, r6, r4
+; CHECK-NEXT:    vmov r0, r6, d9
+; CHECK-NEXT:    sbcs r1, r7, r5
+; CHECK-NEXT:    vmov r4, r5, d7
 ; CHECK-NEXT:    mov r1, #0
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    mvnne r1, #0
-; CHECK-NEXT:    subs r4, r4, r6
-; CHECK-NEXT:    sbcs r2, r2, r7
-; CHECK-NEXT:    mov r2, #0
-; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    subs r0, r0, r4
+; CHECK-NEXT:    vmov r7, r4, d23
+; CHECK-NEXT:    sbcs r0, r6, r5
+; CHECK-NEXT:    vmov r5, lr, d6
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    movlt r0, #1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    mvnne r0, #0
+; CHECK-NEXT:    vdup.32 d11, r0
+; CHECK-NEXT:    vmov r0, r6, d3
+; CHECK-NEXT:    subs r0, r0, r7
+; CHECK-NEXT:    sbcs r0, r6, r4
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    subs r4, r11, r9
 ; CHECK-NEXT:    sbcs r4, r10, r8
 ; CHECK-NEXT:    mov r4, #0
 ; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    subs r3, r3, lr
-; CHECK-NEXT:    sbcs r3, r12, r5
+; CHECK-NEXT:    subs r3, r3, r5
+; CHECK-NEXT:    sbcs r3, r12, lr
 ; CHECK-NEXT:    mov r3, #0
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
@@ -485,28 +432,28 @@ define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
 ; CHECK-NEXT:    vdup.32 d10, r3
 ; CHECK-NEXT:    vdup.32 d1, r4
 ; CHECK-NEXT:    vorr q2, q5, q5
-; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    vdup.32 d0, r1
+; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    vbsl q2, q4, q3
-; CHECK-NEXT:    mvnne r2, #0
-; CHECK-NEXT:    vbif q9, q11, q0
+; CHECK-NEXT:    mvnne r0, #0
+; CHECK-NEXT:    vbif q10, q12, q0
 ; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vdup.32 d7, r2
-; CHECK-NEXT:    vdup.32 d6, r0
+; CHECK-NEXT:    vdup.32 d7, r0
 ; CHECK-NEXT:    add r0, r1, #80
-; CHECK-NEXT:    vbit q10, q1, q3
+; CHECK-NEXT:    vdup.32 d6, r2
+; CHECK-NEXT:    vbit q11, q1, q3
 ; CHECK-NEXT:    vst1.64 {d4, d5}, [r0:128]
 ; CHECK-NEXT:    add r0, r1, #32
-; CHECK-NEXT:    vst1.64 {d26, d27}, [r0:128]
+; CHECK-NEXT:    vst1.64 {d28, d29}, [r0:128]
 ; CHECK-NEXT:    add r0, r1, #48
 ; CHECK-NEXT:    vst1.64 {d30, d31}, [r0:128]
 ; CHECK-NEXT:    add r0, r1, #64
-; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128]!
-; CHECK-NEXT:    vst1.64 {d28, d29}, [r1:128]
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r1:128]!
+; CHECK-NEXT:    vst1.64 {d26, d27}, [r1:128]
 ; CHECK-NEXT:    mov r1, #32
-; CHECK-NEXT:    vst1.64 {d24, d25}, [r0:128], r1
-; CHECK-NEXT:    vst1.64 {d18, d19}, [r0:128]!
-; CHECK-NEXT:    vst1.64 {d20, d21}, [r0:128]
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128], r1
+; CHECK-NEXT:    vst1.64 {d20, d21}, [r0:128]!
+; CHECK-NEXT:    vst1.64 {d22, d23}, [r0:128]
 ; CHECK-NEXT:    add sp, sp, #8
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, sp, #4

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index 752486a8cb33e..2974db0d816b9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -1298,19 +1298,21 @@ for.body:                                         ; preds = %for.body.preheader1
 define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* nocapture readonly %b, float* nocapture %c, i32 %N) {
 ; CHECK-LABEL: half_short_mul:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB8_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB8_3
 ; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    b .LBB8_6
 ; CHECK-NEXT:  .LBB8_3: @ %vector.ph
-; CHECK-NEXT:    bic r12, r3, #3
+; CHECK-NEXT:    bic r7, r3, #3
+; CHECK-NEXT:    str r7, [sp] @ 4-byte Spill
+; CHECK-NEXT:    subs r6, r7, #4
 ; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    sub.w r6, r12, #4
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
 ; CHECK-NEXT:    mov r5, r1
@@ -1319,19 +1321,17 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q0, [r5], #8
 ; CHECK-NEXT:    ldr.w r9, [r4]
-; CHECK-NEXT:    ldr.w r8, [r4, #4]
+; CHECK-NEXT:    ldr.w r10, [r4, #4]
 ; CHECK-NEXT:    adds r4, #8
-; CHECK-NEXT:    vmov r7, s0
-; CHECK-NEXT:    vmov.16 q1[0], r7
-; CHECK-NEXT:    vmov r7, s1
-; CHECK-NEXT:    vmov.16 q1[1], r7
-; CHECK-NEXT:    vmov r7, s2
-; CHECK-NEXT:    vmov.16 q1[2], r7
-; CHECK-NEXT:    vmov r7, s3
-; CHECK-NEXT:    vmov.16 q1[3], r7
-; CHECK-NEXT:    vcvt.f16.s16 q0, q1
+; CHECK-NEXT:    vmov r7, r12, d0
 ; CHECK-NEXT:    vmov.32 q1[0], r9
-; CHECK-NEXT:    vmov.32 q1[1], r8
+; CHECK-NEXT:    vmov r11, r8, d1
+; CHECK-NEXT:    vmov.16 q0[0], r7
+; CHECK-NEXT:    vmov.16 q0[1], r12
+; CHECK-NEXT:    vmov.32 q1[1], r10
+; CHECK-NEXT:    vmov.16 q0[2], r11
+; CHECK-NEXT:    vmov.16 q0[3], r8
+; CHECK-NEXT:    vcvt.f16.s16 q0, q0
 ; CHECK-NEXT:    vmul.f16 q0, q1, q0
 ; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
 ; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
@@ -1340,13 +1340,14 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    vstrb.8 q1, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB8_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
-; CHECK-NEXT:    cmp r12, r3
+; CHECK-NEXT:    ldr r7, [sp] @ 4-byte Reload
+; CHECK-NEXT:    cmp r7, r3
 ; CHECK-NEXT:    beq .LBB8_8
 ; CHECK-NEXT:  .LBB8_6: @ %for.body.preheader13
-; CHECK-NEXT:    sub.w lr, r3, r12
-; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
-; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
-; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
+; CHECK-NEXT:    sub.w lr, r3, r7
+; CHECK-NEXT:    add.w r0, r0, r7, lsl #1
+; CHECK-NEXT:    add.w r1, r1, r7, lsl #1
+; CHECK-NEXT:    add.w r2, r2, r7, lsl #2
 ; CHECK-NEXT:  .LBB8_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrsh r3, [r1], #2
@@ -1359,7 +1360,8 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    vstmia r2!, {s0}
 ; CHECK-NEXT:    le lr, .LBB8_7
 ; CHECK-NEXT:  .LBB8_8: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %cmp10 = icmp eq i32 %N, 0
   br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader

diff  --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 91a9133d8eb82..98f00707df37c 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -116,51 +116,43 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
 ; CHECK-NEXT:    vadd.i32 q3, q0, r0
 ; CHECK-NEXT:    vcmp.u32 hi, q5, q3
 ; CHECK-NEXT:    vpsel q4, q2, q1
-; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    vmov r1, r12, d8
 ; CHECK-NEXT:    vmov.16 q0[0], r1
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    vmov.16 q0[1], r12
+; CHECK-NEXT:    vmov r1, r12, d9
 ; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov r1, s19
-; CHECK-NEXT:    vmov.16 q0[3], r1
 ; CHECK-NEXT:    adr r1, .LCPI2_1
 ; CHECK-NEXT:    vldrw.u32 q4, [r1]
+; CHECK-NEXT:    vmov.16 q0[3], r12
 ; CHECK-NEXT:    vadd.i32 q4, q4, r0
 ; CHECK-NEXT:    vcmp.u32 hi, q5, q4
 ; CHECK-NEXT:    vpsel q5, q2, q1
-; CHECK-NEXT:    vmov r1, s20
+; CHECK-NEXT:    vmov r1, r12, d10
 ; CHECK-NEXT:    vmov.16 q0[4], r1
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov r1, s22
-; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov.16 q0[5], r12
+; CHECK-NEXT:    vmov r1, r12, d11
 ; CHECK-NEXT:    vdup.32 q5, r0
-; CHECK-NEXT:    vmov.16 q0[7], r1
+; CHECK-NEXT:    vmov.16 q0[6], r1
 ; CHECK-NEXT:    vcmp.u32 hi, q5, q3
+; CHECK-NEXT:    vmov.16 q0[7], r12
 ; CHECK-NEXT:    vpsel q6, q2, q1
 ; CHECK-NEXT:    vcmp.u32 hi, q5, q4
-; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmov r0, r1, d12
 ; CHECK-NEXT:    vpsel q1, q2, q1
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vmov.16 q3[1], r1
+; CHECK-NEXT:    vmov r0, r1, d13
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov r0, s27
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q3[3], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q3[5], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q3[7], r0
 ; CHECK-NEXT:    add r0, sp, #56
-; CHECK-NEXT:    vcmp.i16 ne, q3, zr
+; CHECK-NEXT:    vmov.16 q3[7], r1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vcmp.i16 ne, q3, zr
 ; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.i16 ne, q0, zr
@@ -201,27 +193,23 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
 ; CHECK-NEXT:    vadd.i32 q1, q0, r0
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q1
 ; CHECK-NEXT:    vpsel q0, q4, q5
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r12, d0
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.16 q2[1], r12
+; CHECK-NEXT:    vmov r1, r12, d1
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov.16 q2[3], r1
 ; CHECK-NEXT:    adr r1, .LCPI3_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vmov.16 q2[3], r12
 ; CHECK-NEXT:    vadd.i32 q3, q0, r0
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q3
 ; CHECK-NEXT:    vpsel q0, q4, q5
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r12, d0
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.16 q2[5], r12
+; CHECK-NEXT:    vmov r1, r12, d1
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov.16 q2[7], r1
+; CHECK-NEXT:    vmov.16 q2[7], r12
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-NEXT:    vpsel q0, q4, q5
 ; CHECK-NEXT:    vmov.u16 r1, q0[0]
@@ -246,28 +234,24 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q0
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vpsel q6, q4, q5
-; CHECK-NEXT:    vmov r1, s24
+; CHECK-NEXT:    vmov r1, r12, d12
 ; CHECK-NEXT:    vmov.16 q0[0], r1
-; CHECK-NEXT:    vmov r1, s25
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r1, s26
+; CHECK-NEXT:    vmov.16 q0[1], r12
+; CHECK-NEXT:    vmov r1, r12, d13
 ; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov r1, s27
-; CHECK-NEXT:    vmov.16 q0[3], r1
 ; CHECK-NEXT:    adr r1, .LCPI3_3
 ; CHECK-NEXT:    vldrw.u32 q6, [r1]
+; CHECK-NEXT:    vmov.16 q0[3], r12
 ; CHECK-NEXT:    vadd.i32 q6, q6, r0
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q6
 ; CHECK-NEXT:    vpsel q7, q4, q5
-; CHECK-NEXT:    vmov r1, s28
+; CHECK-NEXT:    vmov r1, r12, d14
 ; CHECK-NEXT:    vmov.16 q0[4], r1
-; CHECK-NEXT:    vmov r1, s29
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov r1, s30
+; CHECK-NEXT:    vmov.16 q0[5], r12
+; CHECK-NEXT:    vmov r1, r12, d15
 ; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov r1, s31
-; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:    vdup.32 q7, r0
+; CHECK-NEXT:    vmov.16 q0[7], r12
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vpsel q0, q4, q5
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q1
@@ -285,27 +269,23 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
 ; CHECK-NEXT:    vmov.u16 r1, q0[5]
 ; CHECK-NEXT:    vmov.8 q2[13], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov.8 q2[14], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.8 q2[15], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q3
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[1], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vpsel q1, q4, q5
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.8 q2[15], r1
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vpsel q0, q4, q5
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
@@ -328,23 +308,19 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q0
 ; CHECK-NEXT:    vpsel q1, q4, q5
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q6
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[1], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vpsel q1, q4, q5
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vpsel q0, q4, q5
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
@@ -423,50 +399,45 @@ define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroe
 ; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
 ; CHECK-NEXT:    vldrw.u32 q2, [r2]
 ; CHECK-NEXT:    add.w lr, r3, r0, lsr #1
-; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    vand q1, q1, q0
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q3[2], q3[0], r8, r8
-; CHECK-NEXT:    vmov r7, s6
+; CHECK-NEXT:    vmov q3[2], q3[0], r12, r12
+; CHECK-NEXT:    vmov r6, r7, d3
 ; CHECK-NEXT:    vand q3, q3, q0
-; CHECK-NEXT:    vmov r6, s7
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add.w r8, r8, #2
+; CHECK-NEXT:    add.w r12, r12, #2
+; CHECK-NEXT:    vmov r2, r3, d7
 ; CHECK-NEXT:    vmov r9, s12
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r3, #1
-; CHECK-NEXT:    vmov q3[2], q3[0], r9, r3
+; CHECK-NEXT:    adds r0, r2, #1
+; CHECK-NEXT:    vmov q3[2], q3[0], r9, r0
+; CHECK-NEXT:    adc r8, r3, #0
 ; CHECK-NEXT:    vand q3, q3, q0
-; CHECK-NEXT:    adc r12, r2, #0
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov r4, s15
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    subs r7, r5, r7
-; CHECK-NEXT:    vmov r7, s12
-; CHECK-NEXT:    sbcs r4, r6
-; CHECK-NEXT:    vmov r6, s13
-; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    vmov r3, r2, d2
+; CHECK-NEXT:    vmov r4, r5, d7
+; CHECK-NEXT:    subs r6, r4, r6
+; CHECK-NEXT:    eor.w r0, r0, r4
+; CHECK-NEXT:    sbcs r5, r7
+; CHECK-NEXT:    vmov r6, r7, d6
+; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    subs r2, r7, r2
-; CHECK-NEXT:    sbcs.w r0, r6, r0
-; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    movlo r5, #1
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    subs r3, r6, r3
+; CHECK-NEXT:    sbcs.w r2, r7, r2
+; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r0, #1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r0, r4
-; CHECK-NEXT:    eor.w r0, r5, r3
-; CHECK-NEXT:    orrs.w r0, r0, r12
+; CHECK-NEXT:    movlo r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    orrs.w r0, r0, r8
 ; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r5
 ; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r5
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    teq.w r7, r9
+; CHECK-NEXT:    teq.w r6, r9
 ; CHECK-NEXT:    cset r2, ne
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll
index 4cba3ff05ed84..0893ab63e138a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-abs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll
@@ -40,19 +40,17 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) {
 ; CHECK-LABEL: abs_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    adds.w r1, r1, r0, asr #31
-; CHECK-NEXT:    adc.w r12, r0, r0, asr #31
-; CHECK-NEXT:    eor.w r1, r1, r0, asr #31
-; CHECK-NEXT:    adds.w r2, r2, r3, asr #31
-; CHECK-NEXT:    eor.w r0, r12, r0, asr #31
-; CHECK-NEXT:    eor.w r2, r2, r3, asr #31
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    adc.w r1, r3, r3, asr #31
-; CHECK-NEXT:    eor.w r1, r1, r3, asr #31
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r0, r0, r1, asr #31
+; CHECK-NEXT:    adc.w r12, r1, r1, asr #31
+; CHECK-NEXT:    adds.w r3, r3, r2, asr #31
+; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
+; CHECK-NEXT:    eor.w r3, r3, r2, asr #31
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    eor.w r0, r12, r1, asr #31
+; CHECK-NEXT:    adc.w r1, r2, r2, asr #31
+; CHECK-NEXT:    eor.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll
index 5fe47320c7b0f..c44a9efe39573 100644
--- a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll
@@ -4,26 +4,24 @@
 define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_0_t(<2 x i64> %src){
 ; CHECK-LABEL: ctlz_2i64_0_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    add.w r2, r2, #32
-; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r0, #32
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s6, r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    add.w r2, r2, #32
-; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    clzne r0, r1
+; CHECK-NEXT:    vmov s6, r0
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r0, #32
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s4, r2
+; CHECK-NEXT:    clzne r0, r1
+; CHECK-NEXT:    vmov s4, r0
 ; CHECK-NEXT:    vldr s5, .LCPI0_0
 ; CHECK-NEXT:    vmov.f32 s7, s5
 ; CHECK-NEXT:    vmov q0, q1
@@ -70,26 +68,24 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_1_t(<2 x i64> %src){
 ; CHECK-LABEL: ctlz_2i64_1_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    add.w r2, r2, #32
-; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r0, #32
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s6, r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    add.w r2, r2, #32
-; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    clzne r0, r1
+; CHECK-NEXT:    vmov s6, r0
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r0, #32
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s4, r2
+; CHECK-NEXT:    clzne r0, r1
+; CHECK-NEXT:    vmov s4, r0
 ; CHECK-NEXT:    vldr s5, .LCPI4_0
 ; CHECK-NEXT:    vmov.f32 s7, s5
 ; CHECK-NEXT:    vmov q0, q1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll
index 6a795c415d62b..f6f51068dd5f8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll
@@ -5,56 +5,54 @@
 define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){
 ; CHECK-LABEL: ctpop_2i64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    mov.w r1, #1431655765
-; CHECK-NEXT:    mov.w lr, #858993459
-; CHECK-NEXT:    mov.w r4, #16843009
-; CHECK-NEXT:    and.w r2, r1, r0, lsr #1
-; CHECK-NEXT:    subs r0, r0, r2
-; CHECK-NEXT:    and.w r3, lr, r0, lsr #2
-; CHECK-NEXT:    bic r0, r0, #-858993460
-; CHECK-NEXT:    add r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    add.w r0, r0, r0, lsr #4
-; CHECK-NEXT:    bic r12, r0, #-252645136
-; CHECK-NEXT:    and.w r0, r1, r3, lsr #1
-; CHECK-NEXT:    subs r0, r3, r0
-; CHECK-NEXT:    and.w r3, lr, r0, lsr #2
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    mov.w lr, #1431655765
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    mov.w r12, #858993459
+; CHECK-NEXT:    vldr s1, .LCPI0_0
+; CHECK-NEXT:    and.w r0, lr, r2, lsr #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    and.w r2, r12, r0, lsr #2
 ; CHECK-NEXT:    bic r0, r0, #-858993460
-; CHECK-NEXT:    add r0, r3
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    and.w r2, lr, r1, lsr #1
+; CHECK-NEXT:    subs r1, r1, r2
 ; CHECK-NEXT:    add.w r0, r0, r0, lsr #4
-; CHECK-NEXT:    bic r0, r0, #-252645136
-; CHECK-NEXT:    muls r0, r4, r0
-; CHECK-NEXT:    lsrs r0, r0, #24
-; CHECK-NEXT:    and.w r2, r1, r3, lsr #1
+; CHECK-NEXT:    and.w r2, r12, r1, lsr #2
+; CHECK-NEXT:    bic r1, r1, #-858993460
+; CHECK-NEXT:    add r1, r2
+; CHECK-NEXT:    and.w r2, lr, r3, lsr #1
 ; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    and.w r3, lr, r2, lsr #2
+; CHECK-NEXT:    bic r5, r0, #-252645136
+; CHECK-NEXT:    add.w r1, r1, r1, lsr #4
+; CHECK-NEXT:    mov.w r0, #16843009
+; CHECK-NEXT:    and.w r3, r12, r2, lsr #2
 ; CHECK-NEXT:    bic r2, r2, #-858993460
 ; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vldr s1, .LCPI0_0
+; CHECK-NEXT:    and.w r3, lr, r4, lsr #1
+; CHECK-NEXT:    subs r3, r4, r3
+; CHECK-NEXT:    bic r1, r1, #-252645136
 ; CHECK-NEXT:    add.w r2, r2, r2, lsr #4
+; CHECK-NEXT:    muls r5, r0, r5
+; CHECK-NEXT:    and.w r4, r12, r3, lsr #2
+; CHECK-NEXT:    bic r3, r3, #-858993460
 ; CHECK-NEXT:    bic r2, r2, #-252645136
-; CHECK-NEXT:    muls r2, r4, r2
+; CHECK-NEXT:    add r3, r4
+; CHECK-NEXT:    muls r1, r0, r1
+; CHECK-NEXT:    add.w r3, r3, r3, lsr #4
+; CHECK-NEXT:    muls r2, r0, r2
+; CHECK-NEXT:    bic r3, r3, #-252645136
+; CHECK-NEXT:    muls r0, r3, r0
+; CHECK-NEXT:    lsrs r1, r1, #24
+; CHECK-NEXT:    add.w r1, r1, r5, lsr #24
 ; CHECK-NEXT:    lsrs r2, r2, #24
-; CHECK-NEXT:    and.w r1, r1, r3, lsr #1
-; CHECK-NEXT:    subs r1, r3, r1
-; CHECK-NEXT:    and.w r3, lr, r1, lsr #2
-; CHECK-NEXT:    bic r1, r1, #-858993460
-; CHECK-NEXT:    add r1, r3
-; CHECK-NEXT:    mul r3, r12, r4
-; CHECK-NEXT:    add.w r1, r1, r1, lsr #4
-; CHECK-NEXT:    bic r1, r1, #-252645136
-; CHECK-NEXT:    muls r1, r4, r1
-; CHECK-NEXT:    add.w r0, r0, r3, lsr #24
-; CHECK-NEXT:    vmov s2, r0
-; CHECK-NEXT:    add.w r0, r2, r1, lsr #24
+; CHECK-NEXT:    vmov s2, r1
+; CHECK-NEXT:    add.w r0, r2, r0, lsr #24
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vmov.f32 s3, s1
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI0_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-cttz.ll b/llvm/test/CodeGen/Thumb2/mve-cttz.ll
index 73103f2843ea7..b844bc217e571 100644
--- a/llvm/test/CodeGen/Thumb2/mve-cttz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-cttz.ll
@@ -5,30 +5,28 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){
 ; CHECK-LABEL: cttz_2i64_0_t:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    rbit r2, r2
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    adds r2, #32
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r1, #32
 ; CHECK-NEXT:    rbit r0, r0
-; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s2, r2
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    rbit r2, r2
+; CHECK-NEXT:    clzne r1, r0
+; CHECK-NEXT:    vmov s2, r1
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    adds r2, #32
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r1, #32
 ; CHECK-NEXT:    rbit r0, r0
-; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s0, r2
+; CHECK-NEXT:    clzne r1, r0
+; CHECK-NEXT:    vmov s0, r1
 ; CHECK-NEXT:    vldr s1, .LCPI0_0
 ; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    bx lr
@@ -81,30 +79,28 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){
 ; CHECK-LABEL: cttz_2i64_1_t:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    rbit r2, r2
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    adds r2, #32
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r1, #32
 ; CHECK-NEXT:    rbit r0, r0
-; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s2, r2
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    rbit r2, r2
+; CHECK-NEXT:    clzne r1, r0
+; CHECK-NEXT:    vmov s2, r1
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    clz r2, r2
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    adds r2, #32
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    adds r1, #32
 ; CHECK-NEXT:    rbit r0, r0
-; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    clzne r2, r0
-; CHECK-NEXT:    vmov s0, r2
+; CHECK-NEXT:    clzne r1, r0
+; CHECK-NEXT:    vmov s0, r1
 ; CHECK-NEXT:    vldr s1, .LCPI4_0
 ; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index 4fcacf945a5c1..3a746fc749feb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -5,22 +5,19 @@
 define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-LABEL: udiv_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    udiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    udiv r1, r2, r1
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    udiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    udiv r1, r2, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r0, r12, d3
+; CHECK-NEXT:    vmov r2, lr, d1
+; CHECK-NEXT:    vmov r1, r3, d2
+; CHECK-NEXT:    udiv r0, r2, r0
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    udiv r1, r4, r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    udiv r0, lr, r12
+; CHECK-NEXT:    udiv r1, r5, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = udiv <4 x i32> %in1, %in2
   ret <4 x i32> %out
@@ -29,22 +26,19 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-LABEL: sdiv_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    sdiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    sdiv r1, r2, r1
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    sdiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    sdiv r1, r2, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r0, r12, d3
+; CHECK-NEXT:    vmov r2, lr, d1
+; CHECK-NEXT:    vmov r1, r3, d2
+; CHECK-NEXT:    sdiv r0, r2, r0
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    sdiv r1, r4, r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    sdiv r0, lr, r12
+; CHECK-NEXT:    sdiv r1, r5, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = sdiv <4 x i32> %in1, %in2
   ret <4 x i32> %out
@@ -53,27 +47,23 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @urem_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-LABEL: urem_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    udiv r2, r1, r0
-; CHECK-NEXT:    mls r12, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    udiv r3, r2, r1
-; CHECK-NEXT:    mls lr, r3, r1, r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    udiv r0, r3, r2
-; CHECK-NEXT:    mls r0, r0, r2, r3
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    udiv r1, r3, r2
-; CHECK-NEXT:    mls r1, r1, r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r0, r12, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r1, lr, d2
+; CHECK-NEXT:    udiv r4, r2, r0
+; CHECK-NEXT:    mls r0, r4, r0, r2
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    udiv r5, r2, r1
+; CHECK-NEXT:    mls r1, r5, r1, r2
+; CHECK-NEXT:    udiv r2, r3, r12
+; CHECK-NEXT:    mls r2, r2, r12, r3
+; CHECK-NEXT:    udiv r3, r4, lr
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    mls r3, r3, lr, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = urem <4 x i32> %in1, %in2
   ret <4 x i32> %out
@@ -82,27 +72,23 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @srem_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-LABEL: srem_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    sdiv r2, r1, r0
-; CHECK-NEXT:    mls r12, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    sdiv r3, r2, r1
-; CHECK-NEXT:    mls lr, r3, r1, r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    sdiv r0, r3, r2
-; CHECK-NEXT:    mls r0, r0, r2, r3
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    sdiv r1, r3, r2
-; CHECK-NEXT:    mls r1, r1, r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r0, r12, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r1, lr, d2
+; CHECK-NEXT:    sdiv r4, r2, r0
+; CHECK-NEXT:    mls r0, r4, r0, r2
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    sdiv r5, r2, r1
+; CHECK-NEXT:    mls r1, r5, r1, r2
+; CHECK-NEXT:    sdiv r2, r3, r12
+; CHECK-NEXT:    mls r2, r2, r12, r3
+; CHECK-NEXT:    sdiv r3, r4, lr
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    mls r3, r3, lr, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = srem <4 x i32> %in1, %in2
   ret <4 x i32> %out
@@ -637,17 +623,13 @@ define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov r0, r1, d11
+; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_uldivmod
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_uldivmod
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
@@ -667,17 +649,13 @@ define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov r0, r1, d11
+; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
@@ -697,17 +675,13 @@ define arm_aapcs_vfpcc <2 x i64> @urem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov r0, r1, d11
+; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_uldivmod
 ; CHECK-NEXT:    mov r4, r2
 ; CHECK-NEXT:    mov r5, r3
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_uldivmod
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
@@ -727,17 +701,13 @@ define arm_aapcs_vfpcc <2 x i64> @srem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov r0, r1, d11
+; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    mov r4, r2
 ; CHECK-NEXT:    mov r5, r3
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
@@ -774,24 +744,22 @@ define arm_aapcs_vfpcc <4 x float> @frem_f32(<4 x float> %in1, <4 x float> %in2)
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    vmov r0, r4, d11
+; CHECK-NEXT:    vmov r1, r5, d9
 ; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov r1, s19
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r6, s16
+; CHECK-NEXT:    vmov r4, r2, d10
+; CHECK-NEXT:    vmov r5, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov r5, s20
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r6
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r6
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
index 639c1116a9ab6..d75025c012072 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
@@ -74,19 +74,18 @@ define arm_aapcs_vfpcc <4 x float> @cos_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, r4, d9
 ; CHECK-NEXT:    bl cosf
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl cosf
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r5
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -186,19 +185,18 @@ define arm_aapcs_vfpcc <4 x float> @sin_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, r4, d9
 ; CHECK-NEXT:    bl sinf
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl sinf
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r5
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -298,19 +296,18 @@ define arm_aapcs_vfpcc <4 x float> @exp_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, r4, d9
 ; CHECK-NEXT:    bl expf
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl expf
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r5
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -410,19 +407,18 @@ define arm_aapcs_vfpcc <4 x float> @exp2_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, r4, d9
 ; CHECK-NEXT:    bl exp2f
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl exp2f
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r5
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -522,19 +518,18 @@ define arm_aapcs_vfpcc <4 x float> @log_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, r4, d9
 ; CHECK-NEXT:    bl logf
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl logf
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r5
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -634,19 +629,18 @@ define arm_aapcs_vfpcc <4 x float> @log2_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, r4, d9
 ; CHECK-NEXT:    bl log2f
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl log2f
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r5
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -746,19 +740,18 @@ define arm_aapcs_vfpcc <4 x float> @log10_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, r4, d9
 ; CHECK-NEXT:    bl log10f
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl log10f
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r5
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -859,24 +852,22 @@ define arm_aapcs_vfpcc <4 x float> @pow_float32_t(<4 x float> %src1, <4 x float>
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    vmov r0, r4, d11
+; CHECK-NEXT:    vmov r1, r5, d9
 ; CHECK-NEXT:    bl powf
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov r1, s19
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl powf
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    vmov r6, s16
+; CHECK-NEXT:    vmov r4, r2, d10
+; CHECK-NEXT:    vmov r5, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
-; CHECK-NEXT:    vmov r5, s20
-; CHECK-NEXT:    vmov s18, r4
+; CHECK-NEXT:    vmov s18, r6
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s17, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r6
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov q0, q4
@@ -993,26 +984,22 @@ define arm_aapcs_vfpcc <4 x float> @copysign_float32_t(<4 x float> %src1, <4 x f
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov lr, s6
-; CHECK-NEXT:    vmov r12, s7
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    lsrs r0, r0, #31
-; CHECK-NEXT:    bfi r3, r0, #31, #1
-; CHECK-NEXT:    lsr.w r0, lr, #31
-; CHECK-NEXT:    bfi r2, r0, #31, #1
-; CHECK-NEXT:    lsr.w r0, r12, #31
-; CHECK-NEXT:    bfi r1, r0, #31, #1
-; CHECK-NEXT:    vmov s3, r1
-; CHECK-NEXT:    lsrs r0, r4, #31
-; CHECK-NEXT:    vmov s2, r2
-; CHECK-NEXT:    bfi r5, r0, #31, #1
-; CHECK-NEXT:    vmov s1, r3
-; CHECK-NEXT:    vmov s0, r5
+; CHECK-NEXT:    vmov r12, r1, d2
+; CHECK-NEXT:    vmov r2, lr, d3
+; CHECK-NEXT:    vmov r3, r0, d0
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    lsrs r1, r1, #31
+; CHECK-NEXT:    bfi r0, r1, #31, #1
+; CHECK-NEXT:    lsrs r1, r2, #31
+; CHECK-NEXT:    bfi r4, r1, #31, #1
+; CHECK-NEXT:    lsr.w r1, lr, #31
+; CHECK-NEXT:    bfi r5, r1, #31, #1
+; CHECK-NEXT:    lsr.w r1, r12, #31
+; CHECK-NEXT:    bfi r3, r1, #31, #1
+; CHECK-NEXT:    vmov s3, r5
+; CHECK-NEXT:    vmov s2, r4
+; CHECK-NEXT:    vmov s1, r0
+; CHECK-NEXT:    vmov s0, r3
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> %src1, <4 x float> %src2)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index c4f68959ecf45..eb08826311747 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -41,31 +41,27 @@ define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(i16* noalias nocapture re
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r4, r5, d0
 ; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov r0, r2, d1
+; CHECK-NEXT:    vmov r1, lr, d2
+; CHECK-NEXT:    vmov r3, r12, d3
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.16 q0[0], r4
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    ldrh.w lr, [lr]
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[5], lr
+; CHECK-NEXT:    ldrh.w r12, [r12]
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r12
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
   %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1
@@ -111,61 +107,53 @@ define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(i8* noalias nocapture rea
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i32 q4, #0x10
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, q4
+; CHECK-NEXT:    vadd.i32 q2, q2, q4
+; CHECK-NEXT:    vmov r1, r2, d7
+; CHECK-NEXT:    vmov r3, r4, d6
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    vadd.i32 q3, q0, q4
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, q4
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vadd.i32 q2, q2, q4
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    ldrb r3, [r1]
-; CHECK-NEXT:    vmov r1, s13
-; CHECK-NEXT:    vadd.i32 q3, q0, q4
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r6, s15
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb r1, [r4]
+; CHECK-NEXT:    ldrb r4, [r5]
+; CHECK-NEXT:    ldrb r2, [r6]
+; CHECK-NEXT:    vmov r5, r6, d6
+; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    vmov.8 q0[0], r5
-; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    ldrb r5, [r6]
 ; CHECK-NEXT:    vmov.8 q0[1], r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[2], r5
-; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov r5, r6, d7
+; CHECK-NEXT:    ldrb r0, [r5]
+; CHECK-NEXT:    ldrb r6, [r6]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov r0, r5, d2
 ; CHECK-NEXT:    vmov.8 q0[3], r6
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.8 q0[5], r5
+; CHECK-NEXT:    vmov r0, r5, d3
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.8 q0[8], r5
+; CHECK-NEXT:    vmov.8 q0[7], r5
+; CHECK-NEXT:    vmov r0, r5, d4
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r2
-; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r5
+; CHECK-NEXT:    vmov.8 q0[10], r4
+; CHECK-NEXT:    vmov.8 q0[11], r2
 ; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r1
-; CHECK-NEXT:    vmov.8 q0[14], r12
-; CHECK-NEXT:    vmov.8 q0[15], lr
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
   %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -571,12 +559,12 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    blt .LBB11_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    bic r8, r2, #7
-; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    sub.w r6, r8, #8
+; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    sub.w r12, r8, #8
 ; CHECK-NEXT:    vmov.i16 q1, #0x8
-; CHECK-NEXT:    add.w r1, r5, r6, lsr #3
-; CHECK-NEXT:    adr r6, .LCPI11_0
-; CHECK-NEXT:    vldrw.u32 q0, [r6]
+; CHECK-NEXT:    add.w r1, r4, r12, lsr #3
+; CHECK-NEXT:    adr r4, .LCPI11_0
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB11_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
@@ -600,37 +588,33 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    vmov q3[2], q3[0], r6, r5
 ; CHECK-NEXT:    vshl.i32 q4, q4, #1
 ; CHECK-NEXT:    vmov.u16 r5, q2[3]
-; CHECK-NEXT:    vadd.i32 q4, q4, r0
 ; CHECK-NEXT:    vmov.u16 r6, q2[1]
-; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vadd.i32 q4, q4, r0
 ; CHECK-NEXT:    vmov q3[3], q3[1], r6, r5
+; CHECK-NEXT:    vmov r5, r6, d9
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vmov r7, s17
+; CHECK-NEXT:    vmov r3, r7, d8
 ; CHECK-NEXT:    vshl.i32 q3, q3, #1
 ; CHECK-NEXT:    vadd.i16 q2, q2, q1
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov r12, s13
-; CHECK-NEXT:    ldrh.w r11, [r3]
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    ldrh r7, [r7]
-; CHECK-NEXT:    ldrh.w r9, [r5]
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    ldrh.w r10, [r6]
-; CHECK-NEXT:    vmov r6, s19
-; CHECK-NEXT:    ldrh.w r1, [r12]
+; CHECK-NEXT:    vmov r9, r10, d7
+; CHECK-NEXT:    ldrh.w r12, [r5]
+; CHECK-NEXT:    vmov r5, r1, d6
+; CHECK-NEXT:    ldrh.w r11, [r6]
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q3[0], r3
-; CHECK-NEXT:    vmov.16 q3[1], r1
-; CHECK-NEXT:    vmov.16 q3[2], r10
+; CHECK-NEXT:    ldrh r7, [r7]
+; CHECK-NEXT:    ldrh.w r6, [r9]
+; CHECK-NEXT:    ldrh.w r10, [r10]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q3[3], r9
-; CHECK-NEXT:    ldrh r6, [r6]
-; CHECK-NEXT:    vmov.16 q3[4], r11
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q3[0], r5
+; CHECK-NEXT:    vmov.16 q3[1], r1
+; CHECK-NEXT:    vmov.16 q3[2], r6
+; CHECK-NEXT:    vmov.16 q3[3], r10
+; CHECK-NEXT:    vmov.16 q3[4], r3
 ; CHECK-NEXT:    vmov.16 q3[5], r7
-; CHECK-NEXT:    vmov.16 q3[6], r5
-; CHECK-NEXT:    vmov.16 q3[7], r6
+; CHECK-NEXT:    vmov.16 q3[6], r12
+; CHECK-NEXT:    vmov.16 q3[7], r11
 ; CHECK-NEXT:    vstrb.8 q3, [r4], #16
 ; CHECK-NEXT:    le lr, .LBB11_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
@@ -694,184 +678,172 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #104
-; CHECK-NEXT:    sub sp, #104
+; CHECK-NEXT:    .pad #72
+; CHECK-NEXT:    sub sp, #72
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #64] @ 4-byte Spill
+; CHECK-NEXT:    mov r1, r2
+; CHECK-NEXT:    str r2, [sp, #68] @ 4-byte Spill
 ; CHECK-NEXT:    blt.w .LBB12_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    bic r1, r2, #7
+; CHECK-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
 ; CHECK-NEXT:    adr r6, .LCPI12_2
-; CHECK-NEXT:    sub.w r3, r1, #8
-; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [r6]
 ; CHECK-NEXT:    movs r7, #1
+; CHECK-NEXT:    bic r1, r1, #7
+; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
+; CHECK-NEXT:    sub.w r3, r1, #8
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i16 q0, #0x18
 ; CHECK-NEXT:    add.w r1, r7, r3, lsr #3
 ; CHECK-NEXT:    adr r3, .LCPI12_0
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
 ; CHECK-NEXT:    adr r7, .LCPI12_1
-; CHECK-NEXT:    vmov.i16 q3, #0x18
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [r7]
-; CHECK-NEXT:    str r1, [sp, #52] @ 4-byte Spill
-; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [r7]
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB12_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
-; CHECK-NEXT:    ldr r1, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    ldr r4, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    ldr.w r12, [sp, #64] @ 4-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
 ; CHECK-NEXT:  .LBB12_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vmov.u16 r3, q5[2]
-; CHECK-NEXT:    vmov.u16 r5, q5[0]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
-; CHECK-NEXT:    vmov.u16 r3, q5[3]
-; CHECK-NEXT:    vmov.u16 r5, q5[1]
-; CHECK-NEXT:    vmov.u16 r7, q7[6]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
-; CHECK-NEXT:    vmov.u16 r5, q5[4]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov.u16 r12, q7[4]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vmov.u16 r1, q7[5]
-; CHECK-NEXT:    vadd.i32 q2, q0, r0
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r6, s11
-; CHECK-NEXT:    ldrh.w r9, [r3]
 ; CHECK-NEXT:    vmov.u16 r3, q5[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
+; CHECK-NEXT:    vmov.u16 r5, q5[4]
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r3
 ; CHECK-NEXT:    vmov.u16 r3, q5[7]
 ; CHECK-NEXT:    vmov.u16 r5, q5[5]
-; CHECK-NEXT:    ldrh r6, [r6]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    ldrh.w r10, [r3]
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    ldrh.w r11, [r3]
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r12, r7
-; CHECK-NEXT:    vmov.u16 r7, q7[7]
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r7
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    ldrh.w r8, [r3]
-; CHECK-NEXT:    vmov.u16 r3, q6[0]
-; CHECK-NEXT:    ldrh r7, [r1]
-; CHECK-NEXT:    vmov.u16 r1, q6[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    vmov.u16 r4, q6[2]
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
+; CHECK-NEXT:    vmov.u16 r1, q6[0]
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmov.u16 r6, q5[0]
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r7, r5, d3
+; CHECK-NEXT:    vmov r3, r8, d2
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r4
 ; CHECK-NEXT:    vmov.u16 r1, q6[3]
-; CHECK-NEXT:    vmov.u16 r3, q6[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q6[6]
-; CHECK-NEXT:    vmov.u16 r3, q6[4]
+; CHECK-NEXT:    vmov.u16 r4, q6[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r1
+; CHECK-NEXT:    vmov.u16 r4, q5[2]
+; CHECK-NEXT:    vmov q2[2], q2[0], r6, r4
+; CHECK-NEXT:    vmov.u16 r4, q5[3]
+; CHECK-NEXT:    vmov.u16 r6, q5[1]
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q6[7]
-; CHECK-NEXT:    vmov.u16 r3, q6[5]
+; CHECK-NEXT:    vmov q2[3], q2[1], r6, r4
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q7[2]
-; CHECK-NEXT:    vmov.u16 r3, q7[0]
-; CHECK-NEXT:    vadd.i32 q4, q1, r0
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q7[3]
-; CHECK-NEXT:    vmov.u16 r3, q7[1]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vshl.i32 q2, q2, #1
+; CHECK-NEXT:    vmov r1, r11, d3
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vmov.u16 r6, q6[4]
+; CHECK-NEXT:    vadd.i16 q5, q5, q0
+; CHECK-NEXT:    ldrh.w r10, [r5]
+; CHECK-NEXT:    vmov r4, r5, d4
+; CHECK-NEXT:    ldrh r2, [r3]
+; CHECK-NEXT:    ldrh.w r9, [r7]
+; CHECK-NEXT:    vmov.u16 r7, q4[4]
+; CHECK-NEXT:    ldrh.w r8, [r8]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov.16 q7[0], r4
+; CHECK-NEXT:    ldrh r4, [r5]
+; CHECK-NEXT:    vmov.16 q7[1], r4
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov.16 q1[0], r4
+; CHECK-NEXT:    ldrh r4, [r5]
+; CHECK-NEXT:    vmov.u16 r5, q6[6]
+; CHECK-NEXT:    vmov q3[2], q3[0], r6, r5
+; CHECK-NEXT:    vmov.u16 r5, q6[7]
+; CHECK-NEXT:    vmov.u16 r6, q6[5]
+; CHECK-NEXT:    vmov.16 q1[1], r4
+; CHECK-NEXT:    vmov q3[3], q3[1], r6, r5
+; CHECK-NEXT:    vmov.16 q1[2], r1
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    ldrh.w r5, [r11]
 ; CHECK-NEXT:    vshl.i32 q3, q3, #1
+; CHECK-NEXT:    vadd.i16 q6, q6, q0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
+; CHECK-NEXT:    vmov.16 q1[3], r5
+; CHECK-NEXT:    vmov r1, r4, d6
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov.16 q1[4], r1
+; CHECK-NEXT:    vmov r1, r3, d7
+; CHECK-NEXT:    vmov.16 q1[5], r4
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vmov.16 q1[2], r9
-; CHECK-NEXT:    vmov.16 q1[3], r6
-; CHECK-NEXT:    vmov.16 q1[4], r10
-; CHECK-NEXT:    vmov.16 q1[5], r11
-; CHECK-NEXT:    vmov.16 q1[6], r8
-; CHECK-NEXT:    vmov.16 q1[7], r5
+; CHECK-NEXT:    vmov.16 q1[6], r1
+; CHECK-NEXT:    vmov r1, r4, d5
+; CHECK-NEXT:    ldrh r6, [r1]
+; CHECK-NEXT:    ldrh r1, [r3]
+; CHECK-NEXT:    vmov.u16 r3, q4[2]
+; CHECK-NEXT:    ldrh r5, [r4]
+; CHECK-NEXT:    vmov.u16 r4, q4[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q4[3]
+; CHECK-NEXT:    vmov.u16 r4, q4[1]
+; CHECK-NEXT:    vmov.16 q1[7], r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
+; CHECK-NEXT:    vmov.u16 r4, q4[6]
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vmov.16 q7[2], r6
+; CHECK-NEXT:    vshl.i32 q2, q2, #1
+; CHECK-NEXT:    vmov.16 q7[3], r5
+; CHECK-NEXT:    vadd.i32 q3, q2, r0
+; CHECK-NEXT:    vmov.16 q7[4], r2
+; CHECK-NEXT:    vmov r1, r3, d6
+; CHECK-NEXT:    vmov.16 q7[5], r8
+; CHECK-NEXT:    vmov.16 q7[6], r9
+; CHECK-NEXT:    vmov.16 q7[7], r10
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r1, [r3]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    vmov r1, r3, d7
+; CHECK-NEXT:    vmov q3[2], q3[0], r7, r4
+; CHECK-NEXT:    vmov.u16 r4, q4[7]
+; CHECK-NEXT:    vmov.u16 r7, q4[5]
+; CHECK-NEXT:    vadd.i16 q4, q4, q0
+; CHECK-NEXT:    vmov q3[3], q3[1], r7, r4
+; CHECK-NEXT:    vmovlb.s16 q3, q3
+; CHECK-NEXT:    vshl.i32 q3, q3, #1
+; CHECK-NEXT:    vadd.i32 q3, q3, r0
 ; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r3, [r3]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov r1, s19
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.16 q2[3], r3
+; CHECK-NEXT:    vmov r1, r3, d6
 ; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r3, [r3]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.16 q2[5], r3
+; CHECK-NEXT:    vmov r1, r3, d7
 ; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r3, [r3]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[0], r1
-; CHECK-NEXT:    vmov r1, s13
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i16 q6, q6, q3
-; CHECK-NEXT:    vadd.i16 q5, q5, q3
-; CHECK-NEXT:    vadd.i16 q7, q7, q3
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[4], r1
-; CHECK-NEXT:    vmov r1, s17
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov r1, s19
-; CHECK-NEXT:    vmov.16 q0[6], r7
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[7], r1
-; CHECK-NEXT:    vadd.i16 q0, q0, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vstrb.8 q0, [r4], #16
+; CHECK-NEXT:    vmov.16 q2[7], r3
+; CHECK-NEXT:    vadd.i16 q1, q2, q1
+; CHECK-NEXT:    vadd.i16 q1, q1, q7
+; CHECK-NEXT:    vstrb.8 q1, [r12], #16
 ; CHECK-NEXT:    le lr, .LBB12_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
-; CHECK-NEXT:    cmp r1, r2
+; CHECK-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    bne.w .LBB12_2
 ; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #104
+; CHECK-NEXT:    add sp, #72
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -957,27 +929,23 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(i8* noalias nocapture read
 ; CHECK-NEXT:    .pad #328
 ; CHECK-NEXT:    sub sp, #328
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    str r1, [sp, #120] @ 4-byte Spill
-; CHECK-NEXT:    mov r1, r2
-; CHECK-NEXT:    str r2, [sp, #124] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #124] @ 4-byte Spill
 ; CHECK-NEXT:    blt.w .LBB13_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    ldr r1, [sp, #124] @ 4-byte Reload
-; CHECK-NEXT:    adr.w r6, .LCPI13_8
-; CHECK-NEXT:    adr.w r7, .LCPI13_7
-; CHECK-NEXT:    adr.w r3, .LCPI13_6
-; CHECK-NEXT:    bic r11, r1, #7
 ; CHECK-NEXT:    adr r1, .LCPI13_0
+; CHECK-NEXT:    adr r6, .LCPI13_8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI13_1
-; CHECK-NEXT:    vmov.i32 q5, #0x30
-; CHECK-NEXT:    str.w r11, [sp, #116] @ 4-byte Spill
+; CHECK-NEXT:    adr r7, .LCPI13_7
+; CHECK-NEXT:    adr r3, .LCPI13_6
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI13_5
+; CHECK-NEXT:    bic r10, r2, #7
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    adr.w r6, .LCPI13_9
+; CHECK-NEXT:    adr r6, .LCPI13_9
+; CHECK-NEXT:    vmov.i32 q2, #0x30
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
@@ -990,243 +958,222 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(i8* noalias nocapture read
 ; CHECK-NEXT:  .LBB13_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    adr r1, .LCPI13_3
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI13_4
-; CHECK-NEXT:    vstrw.32 q2, [sp, #288] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q3, [r1]
+; CHECK-NEXT:    vldrw.u32 q5, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI13_2
-; CHECK-NEXT:    vstrw.32 q2, [sp, #224] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q3, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI13_10
-; CHECK-NEXT:    vstrw.32 q2, [sp, #272] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp, #304] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #288] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [sp, #304] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI13_11
-; CHECK-NEXT:    ldr.w r9, [sp, #120] @ 4-byte Reload
-; CHECK-NEXT:    vstrw.32 q2, [sp, #208] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q6, [r1]
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q2, [sp, #192] @ 16-byte Spill
+; CHECK-NEXT:    ldr.w r8, [sp, #124] @ 4-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [sp, #256] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q6, [sp, #272] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [sp, #224] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q7, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [sp, #208] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    mov r11, r10
+; CHECK-NEXT:    vstrw.32 q6, [sp, #240] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #192] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB13_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #240] @ 16-byte Spill
-; CHECK-NEXT:    vadd.i32 q3, q6, r0
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    vstrw.32 q1, [sp, #256] @ 16-byte Spill
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vstrw.32 q0, [sp, #176] @ 16-byte Spill
-; CHECK-NEXT:    vadd.i32 q0, q7, r0
-; CHECK-NEXT:    vstrw.32 q6, [sp, #160] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #256] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q4, q1, r0
 ; CHECK-NEXT:    vstrw.32 q7, [sp, #144] @ 16-byte Spill
-; CHECK-NEXT:    vmov r5, s7
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #240] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q6, q6, r0
-; CHECK-NEXT:    vstrw.32 q4, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vmov r1, lr, d8
+; CHECK-NEXT:    vadd.i32 q7, q7, r0
+; CHECK-NEXT:    vmov r5, r4, d15
+; CHECK-NEXT:    vadd.i32 q6, q0, r0
+; CHECK-NEXT:    vmov r6, r7, d13
+; CHECK-NEXT:    vstrw.32 q1, [sp, #160] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #304] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #176] @ 16-byte Spill
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vstrw.32 q1, [sp, #304] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #224] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q5, [sp, #128] @ 16-byte Spill
 ; CHECK-NEXT:    subs.w r11, r11, #16
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    ldrb.w r9, [r1]
+; CHECK-NEXT:    vmov r1, r3, d14
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ldrb r6, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    ldrb.w r10, [r1]
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrb r4, [r1]
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    ldrb.w r8, [r1]
-; CHECK-NEXT:    vmov r1, s24
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[0], r1
-; CHECK-NEXT:    vmov r1, s25
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[1], r1
-; CHECK-NEXT:    vmov r1, s26
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q7[2], r1
-; CHECK-NEXT:    vmov r1, s27
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q7[3], r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov.8 q7[4], r6
+; CHECK-NEXT:    vmov r1, r3, d12
+; CHECK-NEXT:    vmov.8 q7[2], r5
+; CHECK-NEXT:    ldrb r5, [r6]
+; CHECK-NEXT:    ldrb r6, [r4]
+; CHECK-NEXT:    vmov.8 q7[3], r6
 ; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    vmov.8 q6[0], r1
-; CHECK-NEXT:    vmov r1, s13
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[1], r1
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    vadd.i32 q3, q4, r0
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #224] @ 16-byte Reload
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    vmov r6, r1, d2
+; CHECK-NEXT:    vmov.8 q6[1], r3
+; CHECK-NEXT:    vmov.8 q6[2], r5
+; CHECK-NEXT:    vmov.8 q6[3], r7
+; CHECK-NEXT:    ldrb.w r7, [lr]
+; CHECK-NEXT:    vmov.8 q6[4], r9
+; CHECK-NEXT:    vmov.8 q6[5], r7
+; CHECK-NEXT:    ldrb r4, [r1]
+; CHECK-NEXT:    vmov r1, r5, d3
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #256] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q0, q1, r0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #256] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #240] @ 16-byte Reload
+; CHECK-NEXT:    ldrb.w r12, [r1]
+; CHECK-NEXT:    vmov r1, r3, d9
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #192] @ 16-byte Reload
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[2], r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.8 q6[3], r12
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #304] @ 16-byte Reload
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vstrw.32 q1, [sp, #304] @ 16-byte Spill
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov.8 q6[6], r1
+; CHECK-NEXT:    vmov r1, r7, d0
+; CHECK-NEXT:    vmov.8 q6[7], r3
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r7, [r7]
+; CHECK-NEXT:    vmov.8 q7[4], r1
+; CHECK-NEXT:    vmov r1, r3, d1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #272] @ 16-byte Reload
+; CHECK-NEXT:    vmov.8 q7[5], r7
+; CHECK-NEXT:    vstrw.32 q0, [sp, #272] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[4], r1
-; CHECK-NEXT:    vmov r1, s13
-; CHECK-NEXT:    vmov.8 q6[5], lr
-; CHECK-NEXT:    vmov.8 q6[6], r8
-; CHECK-NEXT:    vmov.8 q6[7], r5
-; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.8 q7[6], r1
+; CHECK-NEXT:    ldrb r1, [r6]
+; CHECK-NEXT:    vmov r7, r6, d0
+; CHECK-NEXT:    vmov.8 q7[7], r3
+; CHECK-NEXT:    vmov r3, lr, d1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #288] @ 16-byte Reload
+; CHECK-NEXT:    vmov.8 q7[8], r1
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #288] @ 16-byte Spill
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    ldrb r7, [r1]
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #208] @ 16-byte Reload
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q7[5], r5
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov.8 q7[6], r10
-; CHECK-NEXT:    vmov.8 q7[7], r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov.8 q7[8], r2
+; CHECK-NEXT:    vmov.8 q7[9], r4
+; CHECK-NEXT:    vmov r4, r1, d0
+; CHECK-NEXT:    vmov.8 q7[10], r12
+; CHECK-NEXT:    vmov.8 q7[11], r5
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    vmov.8 q7[9], r7
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q7[10], r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.8 q7[11], r3
-; CHECK-NEXT:    vmov.8 q7[12], r6
-; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov.8 q7[13], r5
-; CHECK-NEXT:    vmov.8 q7[14], r4
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[8], r1
-; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q6[8], r4
+; CHECK-NEXT:    vmov r5, r4, d1
 ; CHECK-NEXT:    vmov.8 q6[9], r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[10], r1
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vadd.i32 q1, q2, r0
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #192] @ 16-byte Reload
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[11], r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[12], r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[13], r1
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vadd.i32 q0, q5, r0
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #208] @ 16-byte Reload
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov.8 q6[10], r5
+; CHECK-NEXT:    vmov.8 q6[11], r4
+; CHECK-NEXT:    vmov.8 q6[12], r7
+; CHECK-NEXT:    vmov.8 q6[13], r6
+; CHECK-NEXT:    vmov.8 q6[14], r3
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[14], r1
-; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov.8 q7[12], r1
+; CHECK-NEXT:    ldrb r1, [r3]
+; CHECK-NEXT:    vmov.8 q7[13], r1
+; CHECK-NEXT:    vmov r1, r3, d1
+; CHECK-NEXT:    vadd.i32 q0, q1, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vstrw.32 q1, [sp, #240] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #256] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q1, q1, q5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[15], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vadd.i32 q0, q4, r0
-; CHECK-NEXT:    vadd.i32 q4, q4, q5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vstrw.32 q4, [sp, #224] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vstrw.32 q1, [sp, #256] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #160] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
 ; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q7[14], r1
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[15], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vadd.i8 q6, q7, q6
-; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb.w r1, [lr]
+; CHECK-NEXT:    vmov.8 q6[15], r1
+; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vadd.i8 q6, q6, q7
 ; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    vmov.8 q7[0], r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov.8 q7[1], r2
+; CHECK-NEXT:    vmov.8 q7[1], r3
+; CHECK-NEXT:    vmov r1, r3, d1
+; CHECK-NEXT:    vadd.i32 q0, q3, r0
+; CHECK-NEXT:    vadd.i32 q3, q3, q2
+; CHECK-NEXT:    vstrw.32 q3, [sp, #224] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #304] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q3, q3, q2
+; CHECK-NEXT:    vstrw.32 q3, [sp, #304] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #288] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q3, q3, q2
+; CHECK-NEXT:    vstrw.32 q3, [sp, #288] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #272] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q3, q3, q2
+; CHECK-NEXT:    vstrw.32 q3, [sp, #272] @ 16-byte Spill
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[2], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #272] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp, #272] @ 16-byte Spill
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #272] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q4, q4, q5
-; CHECK-NEXT:    vstrw.32 q4, [sp, #272] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #304] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q4, q4, q5
-; CHECK-NEXT:    vstrw.32 q4, [sp, #304] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q4, q4, q5
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[3], r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[4], r1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[5], r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, r3, d1
+; CHECK-NEXT:    vadd.i32 q0, q5, r0
+; CHECK-NEXT:    vadd.i32 q5, q5, q2
+; CHECK-NEXT:    vstrw.32 q5, [sp, #208] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q5, q5, q2
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[6], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vadd.i32 q0, q3, r0
-; CHECK-NEXT:    vadd.i32 q3, q3, q5
-; CHECK-NEXT:    vstrw.32 q3, [sp, #208] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #240] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q3, q3, q5
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[7], r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[8], r1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[9], r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, r3, d1
+; CHECK-NEXT:    vadd.i32 q0, q4, r0
+; CHECK-NEXT:    vadd.i32 q4, q4, q2
+; CHECK-NEXT:    vstrw.32 q4, [sp, #192] @ 16-byte Spill
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[10], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vadd.i32 q0, q2, r0
-; CHECK-NEXT:    vadd.i32 q2, q2, q5
-; CHECK-NEXT:    vstrw.32 q2, [sp, #192] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #288] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q2, q2, q5
-; CHECK-NEXT:    vstrw.32 q2, [sp, #288] @ 16-byte Spill
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[11], r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[12], r1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[13], r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, r3, d1
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[14], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[15], r1
 ; CHECK-NEXT:    vadd.i8 q0, q6, q7
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #160] @ 16-byte Reload
-; CHECK-NEXT:    vstrb.8 q0, [r9], #16
+; CHECK-NEXT:    vstrb.8 q0, [r8], #16
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #176] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q7, q7, q5
-; CHECK-NEXT:    vadd.i32 q6, q6, q5
-; CHECK-NEXT:    vadd.i32 q0, q0, q5
+; CHECK-NEXT:    vadd.i32 q7, q7, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    bne.w .LBB13_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #124] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r11, [sp, #116] @ 4-byte Reload
-; CHECK-NEXT:    cmp r11, r1
+; CHECK-NEXT:    cmp r10, r2
 ; CHECK-NEXT:    bne.w .LBB13_2
 ; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #328
@@ -1347,6 +1294,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(i8* noalias nocapture reado
 ; CHECK-NEXT:    .pad #72
 ; CHECK-NEXT:    sub sp, #72
 ; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    str r1, [sp, #68] @ 4-byte Spill
 ; CHECK-NEXT:    blt.w .LBB14_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    adr r5, .LCPI14_3
@@ -1354,7 +1302,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(i8* noalias nocapture reado
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    adr r6, .LCPI14_2
 ; CHECK-NEXT:    adr r3, .LCPI14_0
-; CHECK-NEXT:    bic r12, r2, #7
+; CHECK-NEXT:    bic r1, r2, #7
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
 ; CHECK-NEXT:    vmov.i32 q4, #0x10
@@ -1366,77 +1314,69 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(i8* noalias nocapture reado
 ; CHECK-NEXT:  .LBB14_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
+; CHECK-NEXT:    ldr.w lr, [sp, #68] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    mov lr, r1
-; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    mov r8, r1
 ; CHECK-NEXT:  .LBB14_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vadd.i32 q1, q7, r0
-; CHECK-NEXT:    vadd.i32 q2, q0, r0
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vadd.i32 q3, q5, r0
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    subs r3, #16
-; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vadd.i32 q2, q6, r0
+; CHECK-NEXT:    vadd.i32 q1, q5, r0
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vadd.i32 q3, q0, r0
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    subs.w r8, r8, #16
+; CHECK-NEXT:    vmov r3, r9, d4
+; CHECK-NEXT:    vadd.i32 q2, q7, r0
 ; CHECK-NEXT:    vadd.i32 q5, q5, q4
+; CHECK-NEXT:    vadd.i32 q6, q6, q4
 ; CHECK-NEXT:    vadd.i32 q7, q7, q4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q4
-; CHECK-NEXT:    ldrb.w r8, [r4]
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb.w r10, [r4]
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    ldrb.w r9, [r4]
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    ldrb.w r11, [r4]
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    vmov.8 q2[0], r6
-; CHECK-NEXT:    vmov r6, s13
+; CHECK-NEXT:    ldrb.w r11, [r6]
+; CHECK-NEXT:    ldrb.w r10, [r7]
+; CHECK-NEXT:    vmov r6, r7, d2
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    vmov.8 q2[1], r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    vmov.8 q2[2], r6
-; CHECK-NEXT:    vmov r6, s15
-; CHECK-NEXT:    vadd.i32 q3, q6, r0
-; CHECK-NEXT:    vadd.i32 q6, q6, q4
-; CHECK-NEXT:    vmov r7, s12
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w r9, [r9]
 ; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    ldrb r7, [r7]
-; CHECK-NEXT:    vmov.8 q2[3], r6
-; CHECK-NEXT:    vmov r6, s5
-; CHECK-NEXT:    vmov.8 q2[4], r7
-; CHECK-NEXT:    vmov r7, s13
+; CHECK-NEXT:    vmov.8 q1[0], r6
+; CHECK-NEXT:    vmov.8 q1[1], r7
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vmov.8 q1[2], r4
+; CHECK-NEXT:    vmov.8 q1[3], r5
+; CHECK-NEXT:    vmov.8 q1[4], r3
+; CHECK-NEXT:    vmov.8 q1[5], r9
+; CHECK-NEXT:    vmov.8 q1[6], r11
+; CHECK-NEXT:    vmov.8 q1[7], r10
+; CHECK-NEXT:    ldrb.w r12, [r7]
+; CHECK-NEXT:    vmov r5, r7, d7
 ; CHECK-NEXT:    ldrb r6, [r6]
+; CHECK-NEXT:    ldrb.w r9, [r7]
+; CHECK-NEXT:    vmov r7, r3, d6
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    ldrb.w r11, [r3]
+; CHECK-NEXT:    vmov r3, r4, d4
 ; CHECK-NEXT:    ldrb r7, [r7]
-; CHECK-NEXT:    vmov.8 q2[5], r7
-; CHECK-NEXT:    vmov r7, s14
-; CHECK-NEXT:    ldrb r7, [r7]
-; CHECK-NEXT:    vmov.8 q2[6], r7
-; CHECK-NEXT:    vmov r7, s15
-; CHECK-NEXT:    ldrb r7, [r7]
-; CHECK-NEXT:    vmov.8 q2[7], r7
-; CHECK-NEXT:    vmov r7, s4
-; CHECK-NEXT:    ldrb r7, [r7]
-; CHECK-NEXT:    vmov.8 q2[8], r7
-; CHECK-NEXT:    vmov.8 q2[9], r6
-; CHECK-NEXT:    vmov.8 q2[10], r8
-; CHECK-NEXT:    vmov.8 q2[11], r10
-; CHECK-NEXT:    vmov.8 q2[12], r5
-; CHECK-NEXT:    vmov.8 q2[13], r9
-; CHECK-NEXT:    vmov.8 q2[14], r11
-; CHECK-NEXT:    vmov.8 q2[15], r4
-; CHECK-NEXT:    vstrb.8 q2, [lr], #16
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov.8 q1[8], r3
+; CHECK-NEXT:    vmov.8 q1[9], r4
+; CHECK-NEXT:    vmov.8 q1[10], r6
+; CHECK-NEXT:    vmov.8 q1[11], r12
+; CHECK-NEXT:    vmov.8 q1[12], r7
+; CHECK-NEXT:    vmov.8 q1[13], r11
+; CHECK-NEXT:    vmov.8 q1[14], r5
+; CHECK-NEXT:    vmov.8 q1[15], r9
+; CHECK-NEXT:    vstrb.8 q1, [lr], #16
 ; CHECK-NEXT:    bne .LBB14_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
-; CHECK-NEXT:    cmp r12, r2
+; CHECK-NEXT:    cmp r1, r2
 ; CHECK-NEXT:    bne .LBB14_2
 ; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #72

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
index df6fc9412a108..37e4122ac012c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@@ -49,36 +49,32 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i16>* %offp
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrh.s32 q0, [r1, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r12, d0
+; CHECK-NEXT:    vmov r3, lr, d1
 ; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrh.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh.w r12, [r12]
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrh.w lr, [lr]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r12
 ; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
@@ -94,29 +90,25 @@ define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %off
 ; CHECK-NEXT:    vldrh.s32 q0, [r1]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vldr.16 s8, [r2]
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vldr.16 s8, [r3]
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vldr.16 s4, [r3]
+; CHECK-NEXT:    vmov r2, r3, d3
 ; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldr.16 s4, [r3]
 ; CHECK-NEXT:    vldr.16 s1, [r2]
 ; CHECK-NEXT:    vins.f16 s1, s4
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vldr.16 s8, [r1]
-; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vins.f16 s2, s8
-; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vldr.16 s3, [r0]
-; CHECK-NEXT:    vins.f16 s3, s8
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
@@ -259,39 +251,35 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vmov.i32 q1, #0x28
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vshl.i32 q2, q2, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrh.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vmov.i32 q0, #0x28
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vmov r2, r12, d2
+; CHECK-NEXT:    vmov r3, lr, d3
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh.w r12, [r12]
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrh.w lr, [lr]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r12
 ; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
@@ -378,51 +366,47 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep3(i16* %base) {
 ; CHECK-LABEL: scaled_v8i16_i16_biggep3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI17_0
-; CHECK-NEXT:    adr.w r12, .LCPI17_1
+; CHECK-NEXT:    adr r2, .LCPI17_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vldrw.u32 q1, [r12]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrh r6, [r2]
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r1, lr, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r2, d1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh.w lr, [lr]
 ; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh.w r12, [r12]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.16 q0[0], r4
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], lr
-; CHECK-NEXT:    vmov.16 q0[3], r6
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], lr
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r12
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI17_0:
-; CHECK-NEXT:    .long 131072 @ 0x20000
-; CHECK-NEXT:    .long 131078 @ 0x20006
-; CHECK-NEXT:    .long 131084 @ 0x2000c
-; CHECK-NEXT:    .long 131090 @ 0x20012
-; CHECK-NEXT:  .LCPI17_1:
 ; CHECK-NEXT:    .long 131096 @ 0x20018
 ; CHECK-NEXT:    .long 131102 @ 0x2001e
 ; CHECK-NEXT:    .long 131108 @ 0x20024
 ; CHECK-NEXT:    .long 131114 @ 0x2002a
+; CHECK-NEXT:  .LCPI17_1:
+; CHECK-NEXT:    .long 131072 @ 0x20000
+; CHECK-NEXT:    .long 131078 @ 0x20006
+; CHECK-NEXT:    .long 131084 @ 0x2000c
+; CHECK-NEXT:    .long 131090 @ 0x20012
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 65536
@@ -433,51 +417,47 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep4(i16* %base) {
 ; CHECK-LABEL: scaled_v8i16_i16_biggep4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI18_0
-; CHECK-NEXT:    adr.w r12, .LCPI18_1
+; CHECK-NEXT:    adr r2, .LCPI18_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vldrw.u32 q1, [r12]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrh r6, [r2]
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r1, lr, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r2, d1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh.w lr, [lr]
 ; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh.w r12, [r12]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.16 q0[0], r4
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], lr
-; CHECK-NEXT:    vmov.16 q0[3], r6
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], lr
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r12
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI18_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 12 @ 0xc
-; CHECK-NEXT:    .long 18 @ 0x12
-; CHECK-NEXT:  .LCPI18_1:
 ; CHECK-NEXT:    .long 24 @ 0x18
 ; CHECK-NEXT:    .long 131072 @ 0x20000
 ; CHECK-NEXT:    .long 36 @ 0x24
 ; CHECK-NEXT:    .long 42 @ 0x2a
+; CHECK-NEXT:  .LCPI18_1:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 18 @ 0x12
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 65536, i32 18, i32 21>
   %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
@@ -487,36 +467,32 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep5(<8 x i16*> %base) {
 ; CHECK-LABEL: scaled_v8i16_i16_biggep5:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vmov.i32 q2, #0x20000
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    ldrh.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r1, lr, d1
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    vmov r0, r12, d2
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r3
+; CHECK-NEXT:    ldrh r6, [r3]
+; CHECK-NEXT:    ldrh.w r3, [lr]
+; CHECK-NEXT:    vmov.16 q0[2], r1
 ; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    ldrh.w r12, [r12]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    vmov.16 q0[5], r12
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.16 q0[7], r6
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %base, i32 65536
   %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
@@ -526,51 +502,47 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep6(i16* %base) {
 ; CHECK-LABEL: scaled_v8i16_i16_biggep6:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI20_0
-; CHECK-NEXT:    adr.w r12, .LCPI20_1
+; CHECK-NEXT:    adr r2, .LCPI20_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vldrw.u32 q1, [r12]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrh r6, [r2]
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r1, lr, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r2, d1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh.w lr, [lr]
 ; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh.w r12, [r12]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.16 q0[0], r4
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], lr
-; CHECK-NEXT:    vmov.16 q0[3], r6
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], lr
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r12
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI20_0:
-; CHECK-NEXT:    .long 2 @ 0x2
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 14 @ 0xe
-; CHECK-NEXT:    .long 20 @ 0x14
-; CHECK-NEXT:  .LCPI20_1:
 ; CHECK-NEXT:    .long 131074 @ 0x20002
 ; CHECK-NEXT:    .long 32 @ 0x20
 ; CHECK-NEXT:    .long 38 @ 0x26
 ; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:  .LCPI20_1:
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 8 @ 0x8
+; CHECK-NEXT:    .long 14 @ 0xe
+; CHECK-NEXT:    .long 20 @ 0x14
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 65536, i32 15, i32 18, i32 21>
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 1
@@ -581,51 +553,47 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep7(i16* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: scaled_v8i16_i16_biggep7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI21_0
-; CHECK-NEXT:    adr.w r12, .LCPI21_1
+; CHECK-NEXT:    adr r2, .LCPI21_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vldrw.u32 q1, [r12]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrh r6, [r2]
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r1, lr, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r2, d1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh.w lr, [lr]
 ; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh.w r12, [r12]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.16 q0[0], r4
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], lr
-; CHECK-NEXT:    vmov.16 q0[3], r6
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], lr
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r12
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI21_0:
-; CHECK-NEXT:    .long 128 @ 0x80
-; CHECK-NEXT:    .long 1206 @ 0x4b6
-; CHECK-NEXT:    .long 1212 @ 0x4bc
-; CHECK-NEXT:    .long 1218 @ 0x4c2
-; CHECK-NEXT:  .LCPI21_1:
 ; CHECK-NEXT:    .long 1224 @ 0x4c8
 ; CHECK-NEXT:    .long 1230 @ 0x4ce
 ; CHECK-NEXT:    .long 1236 @ 0x4d4
 ; CHECK-NEXT:    .long 1242 @ 0x4da
+; CHECK-NEXT:  .LCPI21_1:
+; CHECK-NEXT:    .long 128 @ 0x80
+; CHECK-NEXT:    .long 1206 @ 0x4b6
+; CHECK-NEXT:    .long 1212 @ 0x4bc
+; CHECK-NEXT:    .long 1218 @ 0x4c2
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 65000, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 600
@@ -638,36 +606,32 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_basei32(i32* %base, <8 x i16>
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrh.u32 q0, [r1, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #2
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r12, d0
+; CHECK-NEXT:    vmov r3, lr, d1
 ; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #2
-; CHECK-NEXT:    vshl.i32 q1, q1, #2
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrh.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh.w r12, [r12]
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrh.w lr, [lr]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r12
 ; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
index de2a1e6fcca7c..54e51a4bb0a5c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
@@ -19,38 +19,34 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i8_i16_noext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrb.s32 q0, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, lr, d1
+; CHECK-NEXT:    vmov r12, r3, d0
 ; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    ldrb r6, [r2]
+; CHECK-NEXT:    ldrb.w r2, [r12]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w lr, [lr]
+; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[0], r5
+; CHECK-NEXT:    vmov.16 q0[1], r5
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[1], lr
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.16 q0[3], r12
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r3
+; CHECK-NEXT:    vmov.16 q0[6], r6
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
   %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %offs
@@ -64,36 +60,32 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i8>* %offpt
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrb.s32 q0, [r1, #4]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r12, d0
+; CHECK-NEXT:    vmov r3, lr, d1
 ; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrh.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh.w r12, [r12]
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrh.w lr, [lr]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r12
 ; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
@@ -108,36 +100,32 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(i16* %base, <8 x i8>* %offpt
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r12, d0
+; CHECK-NEXT:    vmov r3, lr, d1
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrh.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh.w r12, [r12]
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrh.w lr, [lr]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r12
 ; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
index 053bfa12c1174..442ecc6267c52 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
@@ -461,16 +461,14 @@ define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q1, #0x10
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    ldr r1, [r1]
 ; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    ldr r1, [r1]
 ; CHECK-NEXT:    ldr r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
index c4a469856cb1e..79c60dcf95ded 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
@@ -20,34 +20,30 @@ define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r12, d0
+; CHECK-NEXT:    vmov r3, lr, d1
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb.w r12, [r12]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    ldrb.w lr, [lr]
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrb r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r12
 ; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[7], lr
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
@@ -82,65 +78,57 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr) {
 ; CHECK-LABEL: unscaled_v16i8_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrb.s32 q0, [r1, #8]
-; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrb.s32 q0, [r1, #12]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov r5, s7
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vldrb.s32 q0, [r1]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vldrb.s32 q0, [r1, #8]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
 ; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r4]
+; CHECK-NEXT:    ldrb r2, [r5]
+; CHECK-NEXT:    vmov r4, r5, d4
 ; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r4, r6, d3
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    ldrb r5, [r4]
+; CHECK-NEXT:    ldrb r4, [r6]
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov.8 q0[3], r7
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r6
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r5
-; CHECK-NEXT:    vmov.8 q0[12], lr
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov.8 q0[10], r5
+; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r2
-; CHECK-NEXT:    vmov.8 q0[14], r3
-; CHECK-NEXT:    vmov.8 q0[15], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.8 q0[14], r12
+; CHECK-NEXT:    vmov.8 q0[15], lr
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
   %offs.sext = sext <16 x i8> %offs to <16 x i32>
@@ -152,65 +140,57 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr) {
 ; CHECK-LABEL: unscaled_v16i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrh.s32 q0, [r1, #16]
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrh.s32 q0, [r1, #24]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov r5, s7
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vldrh.s32 q0, [r1, #16]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
 ; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r4]
+; CHECK-NEXT:    ldrb r2, [r5]
+; CHECK-NEXT:    vmov r4, r5, d4
 ; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r4, r6, d3
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    ldrb r5, [r4]
+; CHECK-NEXT:    ldrb r4, [r6]
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov.8 q0[3], r7
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r6
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r5
-; CHECK-NEXT:    vmov.8 q0[12], lr
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov.8 q0[10], r5
+; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r2
-; CHECK-NEXT:    vmov.8 q0[14], r3
-; CHECK-NEXT:    vmov.8 q0[15], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.8 q0[14], r12
+; CHECK-NEXT:    vmov.8 q0[15], lr
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
   %offs.sext = sext <16 x i16> %offs to <16 x i32>
@@ -222,69 +202,61 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr) {
 ; CHECK-LABEL: unscaled_v16i8_scaled:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1, #8]
-; CHECK-NEXT:    vldrb.u32 q2, [r1, #4]
-; CHECK-NEXT:    vshl.i32 q0, q0, #2
-; CHECK-NEXT:    vshl.i32 q2, q2, #2
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrb.u32 q0, [r1, #12]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vshl.i32 q0, q0, #2
-; CHECK-NEXT:    vmov r6, s4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r5, s7
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #2
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vldrb.u32 q0, [r1, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #2
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r4]
+; CHECK-NEXT:    ldrb r2, [r5]
+; CHECK-NEXT:    vmov r4, r5, d4
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r4, r6, d3
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    ldrb r5, [r4]
+; CHECK-NEXT:    ldrb r4, [r6]
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vldrb.u32 q2, [r1, #4]
+; CHECK-NEXT:    vshl.i32 q2, q2, #2
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov.8 q0[3], r7
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r6
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r5
-; CHECK-NEXT:    vmov.8 q0[12], lr
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov.8 q0[10], r5
+; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r2
-; CHECK-NEXT:    vmov.8 q0[14], r3
-; CHECK-NEXT:    vmov.8 q0[15], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.8 q0[14], r12
+; CHECK-NEXT:    vmov.8 q0[15], lr
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
   %offs.zext = zext <16 x i8> %offs to <16 x i32>
@@ -297,65 +269,57 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr) {
 ; CHECK-LABEL: unscaled_v16i8_i8_next:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #16]
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r1, #48]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov r5, s7
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
 ; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r4]
+; CHECK-NEXT:    ldrb r2, [r5]
+; CHECK-NEXT:    vmov r4, r5, d4
 ; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r4, r6, d3
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    ldrb r5, [r4]
+; CHECK-NEXT:    ldrb r4, [r6]
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r1, #16]
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov.8 q0[3], r7
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r6
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r5
-; CHECK-NEXT:    vmov.8 q0[12], lr
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov.8 q0[10], r5
+; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r2
-; CHECK-NEXT:    vmov.8 q0[14], r3
-; CHECK-NEXT:    vmov.8 q0[15], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.8 q0[14], r12
+; CHECK-NEXT:    vmov.8 q0[15], lr
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
   %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
@@ -372,59 +336,51 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %
 ; CHECK-NEXT:    vmov.i32 q2, #0x5
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r3, [r2]
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vldrb.s32 q0, [r1, #8]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q0, q2
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
 ; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vmov r6, s4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r4, s6
 ; CHECK-NEXT:    vadd.i32 q3, q0, q2
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r7, s15
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r6, [r6]
+; CHECK-NEXT:    vldrb.s32 q0, [r1, #8]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vadd.i32 q1, q0, q2
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r4]
+; CHECK-NEXT:    ldrb r2, [r5]
+; CHECK-NEXT:    vmov r4, r5, d6
 ; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r7, [r7]
-; CHECK-NEXT:    vmov.8 q0[0], r5
-; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r4, r6, d3
 ; CHECK-NEXT:    vmov.8 q0[1], r5
-; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    ldrb r5, [r4]
+; CHECK-NEXT:    ldrb r4, [r6]
+; CHECK-NEXT:    vmov r6, r7, d7
 ; CHECK-NEXT:    vldrb.s32 q3, [r1, #4]
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
 ; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[2], r5
-; CHECK-NEXT:    vmov r5, s7
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r7, [r7]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vmov.8 q0[3], r7
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r6
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r4
-; CHECK-NEXT:    vmov.8 q0[11], r5
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov.8 q0[10], r5
+; CHECK-NEXT:    vmov.8 q0[11], r4
 ; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r2
 ; CHECK-NEXT:    vmov.8 q0[14], r12
@@ -543,81 +499,73 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(i8* %base) {
 ; CHECK-LABEL: unscaled_v16i8_i8_biggep3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI11_0
-; CHECK-NEXT:    adr r2, .LCPI11_1
+; CHECK-NEXT:    adr r4, .LCPI11_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r6, .LCPI11_2
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    adr r7, .LCPI11_3
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    vmov r4, r6, d3
 ; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r3, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    adr r6, .LCPI11_3
-; CHECK-NEXT:    vldrw.u32 q2, [r6]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
+; CHECK-NEXT:    adr r1, .LCPI11_2
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb r1, [r5]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb r2, [r6]
+; CHECK-NEXT:    vmov r5, r6, d4
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r5
+; CHECK-NEXT:    ldrb r5, [r6]
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r7]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r0, [r5]
+; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r5, d4
+; CHECK-NEXT:    vmov.8 q0[3], r6
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r5
+; CHECK-NEXT:    vmov r0, r5, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r5
+; CHECK-NEXT:    vmov.8 q0[7], r5
+; CHECK-NEXT:    vmov r0, r5, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r4
-; CHECK-NEXT:    vmov.8 q0[12], lr
-; CHECK-NEXT:    vmov.8 q0[13], r3
-; CHECK-NEXT:    vmov.8 q0[14], r1
-; CHECK-NEXT:    vmov.8 q0[15], r2
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r5
+; CHECK-NEXT:    vmov.8 q0[10], r4
+; CHECK-NEXT:    vmov.8 q0[11], r2
+; CHECK-NEXT:    vmov.8 q0[12], r3
+; CHECK-NEXT:    vmov.8 q0[13], r1
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI11_0:
-; CHECK-NEXT:    .long 280 @ 0x118
-; CHECK-NEXT:    .long 283 @ 0x11b
-; CHECK-NEXT:    .long 286 @ 0x11e
-; CHECK-NEXT:    .long 289 @ 0x121
-; CHECK-NEXT:  .LCPI11_1:
 ; CHECK-NEXT:    .long 292 @ 0x124
 ; CHECK-NEXT:    .long 295 @ 0x127
 ; CHECK-NEXT:    .long 298 @ 0x12a
 ; CHECK-NEXT:    .long 301 @ 0x12d
+; CHECK-NEXT:  .LCPI11_1:
+; CHECK-NEXT:    .long 280 @ 0x118
+; CHECK-NEXT:    .long 283 @ 0x11b
+; CHECK-NEXT:    .long 286 @ 0x11e
+; CHECK-NEXT:    .long 289 @ 0x121
 ; CHECK-NEXT:  .LCPI11_2:
 ; CHECK-NEXT:    .long 256 @ 0x100
 ; CHECK-NEXT:    .long 259 @ 0x103
@@ -639,81 +587,73 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(i8* %base) {
 ; CHECK-LABEL: unscaled_v16i8_i8_biggep4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI12_0
-; CHECK-NEXT:    adr r2, .LCPI12_1
+; CHECK-NEXT:    adr r4, .LCPI12_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r6, .LCPI12_2
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    adr r7, .LCPI12_3
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    vmov r4, r6, d3
 ; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r3, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    adr r6, .LCPI12_3
-; CHECK-NEXT:    vldrw.u32 q2, [r6]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
+; CHECK-NEXT:    adr r1, .LCPI12_2
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb r1, [r5]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb r2, [r6]
+; CHECK-NEXT:    vmov r5, r6, d4
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r5
+; CHECK-NEXT:    ldrb r5, [r6]
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r7]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r0, [r5]
+; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r5, d4
+; CHECK-NEXT:    vmov.8 q0[3], r6
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r5
+; CHECK-NEXT:    vmov r0, r5, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r5
+; CHECK-NEXT:    vmov.8 q0[7], r5
+; CHECK-NEXT:    vmov r0, r5, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r4
-; CHECK-NEXT:    vmov.8 q0[12], lr
-; CHECK-NEXT:    vmov.8 q0[13], r3
-; CHECK-NEXT:    vmov.8 q0[14], r1
-; CHECK-NEXT:    vmov.8 q0[15], r2
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r5
+; CHECK-NEXT:    vmov.8 q0[10], r4
+; CHECK-NEXT:    vmov.8 q0[11], r2
+; CHECK-NEXT:    vmov.8 q0[12], r3
+; CHECK-NEXT:    vmov.8 q0[13], r1
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI12_0:
-; CHECK-NEXT:    .long 256 @ 0x100
-; CHECK-NEXT:    .long 27 @ 0x1b
-; CHECK-NEXT:    .long 30 @ 0x1e
-; CHECK-NEXT:    .long 33 @ 0x21
-; CHECK-NEXT:  .LCPI12_1:
 ; CHECK-NEXT:    .long 36 @ 0x24
 ; CHECK-NEXT:    .long 39 @ 0x27
 ; CHECK-NEXT:    .long 42 @ 0x2a
 ; CHECK-NEXT:    .long 45 @ 0x2d
+; CHECK-NEXT:  .LCPI12_1:
+; CHECK-NEXT:    .long 256 @ 0x100
+; CHECK-NEXT:    .long 27 @ 0x1b
+; CHECK-NEXT:    .long 30 @ 0x1e
+; CHECK-NEXT:    .long 33 @ 0x21
 ; CHECK-NEXT:  .LCPI12_2:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 3 @ 0x3
@@ -734,65 +674,57 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) {
 ; CHECK-LABEL: unscaled_v16i8_i8_biggep5:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i32 q4, #0x100
-; CHECK-NEXT:    vadd.i32 q2, q2, q4
 ; CHECK-NEXT:    vadd.i32 q3, q3, q4
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vadd.i32 q2, q2, q4
+; CHECK-NEXT:    vmov r3, r2, d7
 ; CHECK-NEXT:    vadd.i32 q1, q1, q4
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    ldrb.w r12, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb.w lr, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r2, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r3, [r0]
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r0, r1, d6
 ; CHECK-NEXT:    vadd.i32 q3, q0, q4
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[0], r5
-; CHECK-NEXT:    vmov r5, s13
+; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r1]
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb r1, [r5]
+; CHECK-NEXT:    vmov r2, r5, d6
+; CHECK-NEXT:    ldrb r4, [r0]
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r2
 ; CHECK-NEXT:    vmov.8 q0[1], r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[2], r5
-; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    vmov r2, r5, d7
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[2], r2
 ; CHECK-NEXT:    vmov.8 q0[3], r5
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[4], r5
-; CHECK-NEXT:    vmov r5, s5
+; CHECK-NEXT:    vmov r2, r5, d2
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[4], r2
 ; CHECK-NEXT:    vmov.8 q0[5], r5
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[6], r5
-; CHECK-NEXT:    vmov r5, s7
+; CHECK-NEXT:    vmov r2, r5, d3
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[6], r2
 ; CHECK-NEXT:    vmov.8 q0[7], r5
-; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    vmov.8 q0[8], r4
+; CHECK-NEXT:    vmov r2, r5, d4
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[8], r2
 ; CHECK-NEXT:    vmov.8 q0[9], r5
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], lr
-; CHECK-NEXT:    vmov.8 q0[12], r2
+; CHECK-NEXT:    vmov.8 q0[10], r1
+; CHECK-NEXT:    vmov.8 q0[11], r0
+; CHECK-NEXT:    vmov.8 q0[12], r4
 ; CHECK-NEXT:    vmov.8 q0[13], r3
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.8 q0[15], r1
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
 	%ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256
   %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
@@ -803,81 +735,73 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(i8* %base) {
 ; CHECK-LABEL: unscaled_v16i8_i8_biggep6:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI14_0
-; CHECK-NEXT:    adr r2, .LCPI14_1
+; CHECK-NEXT:    adr r4, .LCPI14_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r6, .LCPI14_2
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    adr r7, .LCPI14_3
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    vmov r4, r6, d3
 ; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r3, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    adr r6, .LCPI14_3
-; CHECK-NEXT:    vldrw.u32 q2, [r6]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
+; CHECK-NEXT:    adr r1, .LCPI14_2
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb r1, [r5]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb r2, [r6]
+; CHECK-NEXT:    vmov r5, r6, d4
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r5
+; CHECK-NEXT:    ldrb r5, [r6]
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r7]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r0, [r5]
+; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r5, d4
+; CHECK-NEXT:    vmov.8 q0[3], r6
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r5
+; CHECK-NEXT:    vmov r0, r5, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r5
+; CHECK-NEXT:    vmov.8 q0[7], r5
+; CHECK-NEXT:    vmov r0, r5, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r4
-; CHECK-NEXT:    vmov.8 q0[12], lr
-; CHECK-NEXT:    vmov.8 q0[13], r3
-; CHECK-NEXT:    vmov.8 q0[14], r1
-; CHECK-NEXT:    vmov.8 q0[15], r2
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r5
+; CHECK-NEXT:    vmov.8 q0[10], r4
+; CHECK-NEXT:    vmov.8 q0[11], r2
+; CHECK-NEXT:    vmov.8 q0[12], r3
+; CHECK-NEXT:    vmov.8 q0[13], r1
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI14_0:
-; CHECK-NEXT:    .long 257 @ 0x101
-; CHECK-NEXT:    .long 28 @ 0x1c
-; CHECK-NEXT:    .long 31 @ 0x1f
-; CHECK-NEXT:    .long 34 @ 0x22
-; CHECK-NEXT:  .LCPI14_1:
 ; CHECK-NEXT:    .long 37 @ 0x25
 ; CHECK-NEXT:    .long 40 @ 0x28
 ; CHECK-NEXT:    .long 43 @ 0x2b
 ; CHECK-NEXT:    .long 46 @ 0x2e
+; CHECK-NEXT:  .LCPI14_1:
+; CHECK-NEXT:    .long 257 @ 0x101
+; CHECK-NEXT:    .long 28 @ 0x1c
+; CHECK-NEXT:    .long 31 @ 0x1f
+; CHECK-NEXT:    .long 34 @ 0x22
 ; CHECK-NEXT:  .LCPI14_2:
 ; CHECK-NEXT:    .long 1 @ 0x1
 ; CHECK-NEXT:    .long 4 @ 0x4
@@ -899,81 +823,73 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(i8* %base) {
 ; CHECK-LABEL: unscaled_v16i8_i8_biggep7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    adr r1, .LCPI15_0
-; CHECK-NEXT:    adr r2, .LCPI15_1
+; CHECK-NEXT:    adr r4, .LCPI15_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r6, .LCPI15_2
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    adr r7, .LCPI15_3
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    vmov r4, r6, d3
 ; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r3, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    adr r6, .LCPI15_3
-; CHECK-NEXT:    vldrw.u32 q2, [r6]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
+; CHECK-NEXT:    adr r1, .LCPI15_2
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb r1, [r5]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb r2, [r6]
+; CHECK-NEXT:    vmov r5, r6, d4
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r5
+; CHECK-NEXT:    ldrb r5, [r6]
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r7]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r0, [r5]
+; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r5, d4
+; CHECK-NEXT:    vmov.8 q0[3], r6
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r5
+; CHECK-NEXT:    vmov r0, r5, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r5
+; CHECK-NEXT:    vmov.8 q0[7], r5
+; CHECK-NEXT:    vmov r0, r5, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r4
-; CHECK-NEXT:    vmov.8 q0[12], lr
-; CHECK-NEXT:    vmov.8 q0[13], r3
-; CHECK-NEXT:    vmov.8 q0[14], r1
-; CHECK-NEXT:    vmov.8 q0[15], r2
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r5
+; CHECK-NEXT:    vmov.8 q0[10], r4
+; CHECK-NEXT:    vmov.8 q0[11], r2
+; CHECK-NEXT:    vmov.8 q0[12], r3
+; CHECK-NEXT:    vmov.8 q0[13], r1
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI15_0:
-; CHECK-NEXT:    .long 224 @ 0xe0
-; CHECK-NEXT:    .long 227 @ 0xe3
-; CHECK-NEXT:    .long 230 @ 0xe6
-; CHECK-NEXT:    .long 233 @ 0xe9
-; CHECK-NEXT:  .LCPI15_1:
 ; CHECK-NEXT:    .long 236 @ 0xec
 ; CHECK-NEXT:    .long 239 @ 0xef
 ; CHECK-NEXT:    .long 242 @ 0xf2
 ; CHECK-NEXT:    .long 245 @ 0xf5
+; CHECK-NEXT:  .LCPI15_1:
+; CHECK-NEXT:    .long 224 @ 0xe0
+; CHECK-NEXT:    .long 227 @ 0xe3
+; CHECK-NEXT:    .long 230 @ 0xe6
+; CHECK-NEXT:    .long 233 @ 0xe9
 ; CHECK-NEXT:  .LCPI15_2:
 ; CHECK-NEXT:    .long 300 @ 0x12c
 ; CHECK-NEXT:    .long 203 @ 0xcb
@@ -995,65 +911,57 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(i8* %base, <16 x i8>* %offptr) {
 ; CHECK-LABEL: unscaled_v16i8_i8_2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrb.s32 q0, [r1, #8]
-; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrb.s32 q0, [r1, #12]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov r5, s7
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vldrb.s32 q0, [r1]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vldrb.s32 q0, [r1, #8]
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
 ; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r4]
+; CHECK-NEXT:    ldrb r2, [r5]
+; CHECK-NEXT:    vmov r4, r5, d4
 ; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r4, r6, d3
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    ldrb r5, [r4]
+; CHECK-NEXT:    ldrb r4, [r6]
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov.8 q0[3], r7
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r6
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r5
-; CHECK-NEXT:    vmov.8 q0[12], lr
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov.8 q0[10], r5
+; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r2
-; CHECK-NEXT:    vmov.8 q0[14], r3
-; CHECK-NEXT:    vmov.8 q0[15], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.8 q0[14], r12
+; CHECK-NEXT:    vmov.8 q0[15], lr
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
   %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
@@ -1097,69 +1005,61 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(i16* %base, <16 x i8>* %offptr) {
 ; CHECK-LABEL: unscaled_v16i8_basei16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1, #8]
-; CHECK-NEXT:    vldrb.u32 q2, [r1, #4]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vshl.i32 q2, q2, #1
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrb.u32 q0, [r1, #12]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vmov r6, s4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r5, s7
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q3, q0, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vldrb.u32 q0, [r1, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    ldrb r3, [r4]
+; CHECK-NEXT:    ldrb r2, [r5]
+; CHECK-NEXT:    vmov r4, r5, d4
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r4, r6, d3
+; CHECK-NEXT:    vmov.8 q0[1], r5
+; CHECK-NEXT:    ldrb r5, [r4]
+; CHECK-NEXT:    ldrb r4, [r6]
+; CHECK-NEXT:    vmov r6, r7, d5
+; CHECK-NEXT:    vldrb.u32 q2, [r1, #4]
+; CHECK-NEXT:    vshl.i32 q2, q2, #1
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    ldrb r0, [r6]
+; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov.8 q0[3], r7
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r6
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r5
-; CHECK-NEXT:    vmov.8 q0[12], lr
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov.8 q0[10], r5
+; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    vmov.8 q0[12], r3
 ; CHECK-NEXT:    vmov.8 q0[13], r2
-; CHECK-NEXT:    vmov.8 q0[14], r3
-; CHECK-NEXT:    vmov.8 q0[15], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.8 q0[14], r12
+; CHECK-NEXT:    vmov.8 q0[15], lr
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
   %offs.zext = zext <16 x i8> %offs to <16 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 7b9a10571a839..755011ad0d146 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -32,31 +32,27 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r3, r12, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldr.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    ldr.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    ldr r0, [r0]
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldr r7, [r2]
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    ldr r6, [r1]
 ; CHECK-NEXT:    ldr r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldr r4, [r4]
-; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
-; CHECK-NEXT:    ldr r1, [r1]
+; CHECK-NEXT:    ldr r0, [r0]
+; CHECK-NEXT:    ldr.w r1, [r12]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
+; CHECK-NEXT:    ldr.w r5, [lr]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
 ; CHECK-NEXT:    ldr r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    ldr r4, [r4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
   %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
@@ -68,50 +64,42 @@ define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(<16 x i32*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r7, s5
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    ldr r7, [r2]
+; CHECK-NEXT:    vmov r2, r6, d0
 ; CHECK-NEXT:    ldr.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    ldr r3, [r3]
+; CHECK-NEXT:    ldr r4, [r4]
 ; CHECK-NEXT:    ldr r5, [r5]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r12
+; CHECK-NEXT:    ldr.w r1, [lr]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r7
+; CHECK-NEXT:    ldr r2, [r2]
 ; CHECK-NEXT:    ldr r6, [r6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT:    vmov r2, r4, d3
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r5
+; CHECK-NEXT:    vmov r6, r5, d2
 ; CHECK-NEXT:    ldr r2, [r2]
-; CHECK-NEXT:    ldr r7, [r7]
-; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    ldr r4, [r4]
-; CHECK-NEXT:    ldr.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ldr r3, [r1]
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r5
-; CHECK-NEXT:    vmov r6, s7
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    ldr r1, [r1]
 ; CHECK-NEXT:    ldr r6, [r6]
 ; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r7, r6
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    ldr r6, [r6]
-; CHECK-NEXT:    vmov q1[2], q1[0], r6, r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov r6, s13
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, lr
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
+; CHECK-NEXT:    ldr r6, [r4]
+; CHECK-NEXT:    vmov r0, r2, d5
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r6
+; CHECK-NEXT:    vmov r6, r5, d4
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    ldr r6, [r6]
-; CHECK-NEXT:    vmov q1[3], q1[1], r6, r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r12
-; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r5
+; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    ldr r5, [r5]
+; CHECK-NEXT:    vmov q2[2], q2[0], r6, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r5, r2
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
@@ -149,25 +137,23 @@ entry:
 define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(<8 x float*>* %offptr) {
 ; CHECK-LABEL: ptr_v8f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vldr s3, [r1]
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vldr s2, [r1]
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r12, r2, d1
+; CHECK-NEXT:    vmov lr, r1, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vldr s3, [r2]
+; CHECK-NEXT:    vldr s2, [r12]
 ; CHECK-NEXT:    vldr s1, [r1]
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vldr s7, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vldr s0, [r1]
+; CHECK-NEXT:    vldr s7, [r3]
 ; CHECK-NEXT:    vldr s6, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vldr s5, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vldr s4, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vldr s5, [r5]
+; CHECK-NEXT:    vldr s0, [lr]
+; CHECK-NEXT:    vldr s4, [r4]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x float*>, <8 x float*>* %offptr, align 4
   %gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
@@ -179,35 +165,31 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @ptr_i16(<8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    vmov r3, r12, d1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r6, [r3]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.16 q0[0], r4
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], r12
-; CHECK-NEXT:    vmov.16 q0[3], lr
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    ldrh.w r3, [lr]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    ldrh.w r12, [r12]
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.16 q0[6], r6
+; CHECK-NEXT:    vmov.16 q0[7], r12
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
   %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
@@ -253,10 +235,8 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r2, [r2]
 ; CHECK-NEXT:    ldrh r1, [r1]
@@ -276,10 +256,8 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r2, [r2]
 ; CHECK-NEXT:    ldrh r1, [r1]
@@ -298,33 +276,29 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r12, r3, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldrh r7, [r2]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh.w r2, [r12]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh.w r6, [lr]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r7
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
   %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
@@ -335,33 +309,29 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r12, r3, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldrh r7, [r2]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh.w r2, [r12]
+; CHECK-NEXT:    ldrh r4, [r4]
 ; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh.w r6, [lr]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
-; CHECK-NEXT:    vmovlb.u16 q0, q0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r7
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
+; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
   %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
@@ -375,27 +345,23 @@ define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) {
 ; CHECK-LABEL: ptr_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vldr.16 s8, [r1]
-; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov r1, r2, d2
+; CHECK-NEXT:    vldr.16 s8, [r2]
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vldr.16 s4, [r2]
+; CHECK-NEXT:    vmov r1, r2, d3
 ; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldr.16 s4, [r2]
 ; CHECK-NEXT:    vldr.16 s1, [r1]
 ; CHECK-NEXT:    vins.f16 s1, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vldr.16 s8, [r1]
-; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vins.f16 s2, s8
-; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vldr.16 s3, [r0]
-; CHECK-NEXT:    vins.f16 s3, s8
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x half*>, <8 x half*>* %offptr, align 4
@@ -408,61 +374,53 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r6, s11
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r6, [r6]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    ldrb r3, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov.8 q0[0], r5
-; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[1], r5
-; CHECK-NEXT:    vmov r5, s10
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    vmov r6, r7, d4
+; CHECK-NEXT:    vmov r4, r3, d1
+; CHECK-NEXT:    ldrb r5, [r1]
+; CHECK-NEXT:    ldrb r1, [r2]
+; CHECK-NEXT:    ldrb r2, [r6]
+; CHECK-NEXT:    ldrb.w r12, [r3]
+; CHECK-NEXT:    vmov.8 q0[0], r2
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    ldrb.w lr, [r4]
+; CHECK-NEXT:    ldrb r4, [r2]
+; CHECK-NEXT:    ldrb r2, [r3]
+; CHECK-NEXT:    ldrb r3, [r7]
+; CHECK-NEXT:    vmov.8 q0[1], r3
+; CHECK-NEXT:    vmov r3, r6, d5
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.8 q0[2], r5
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb r6, [r6]
+; CHECK-NEXT:    vmov.8 q0[2], r3
+; CHECK-NEXT:    vmov r0, r3, d4
 ; CHECK-NEXT:    vmov.8 q0[3], r6
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmov.8 q0[5], r3
+; CHECK-NEXT:    vmov r0, r3, d5
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[8], r5
+; CHECK-NEXT:    vmov.8 q0[7], r3
+; CHECK-NEXT:    vmov r0, r3, d2
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.8 q0[10], r12
-; CHECK-NEXT:    vmov.8 q0[11], r4
-; CHECK-NEXT:    vmov.8 q0[12], lr
-; CHECK-NEXT:    vmov.8 q0[13], r3
-; CHECK-NEXT:    vmov.8 q0[14], r1
-; CHECK-NEXT:    vmov.8 q0[15], r2
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.8 q0[8], r0
+; CHECK-NEXT:    vmov.8 q0[9], r3
+; CHECK-NEXT:    vmov.8 q0[10], r4
+; CHECK-NEXT:    vmov.8 q0[11], r2
+; CHECK-NEXT:    vmov.8 q0[12], r5
+; CHECK-NEXT:    vmov.8 q0[13], r1
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4
   %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
@@ -472,36 +430,32 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(<8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_sext16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r3, r1, d1
+; CHECK-NEXT:    vmov r12, r2, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov lr, r0, d1
+; CHECK-NEXT:    ldrb r7, [r1]
+; CHECK-NEXT:    ldrb.w r1, [r12]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r5
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrb.w r6, [lr]
+; CHECK-NEXT:    vmov.16 q0[1], r5
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[1], lr
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], r3
-; CHECK-NEXT:    vmov.16 q0[3], r12
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r6
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.16 q0[4], r1
 ; CHECK-NEXT:    vmov.16 q0[5], r2
-; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r7
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
   %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
@@ -512,36 +466,32 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(<8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_zext16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r3, r1, d1
+; CHECK-NEXT:    vmov r12, r2, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    ldrb.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov lr, r0, d1
+; CHECK-NEXT:    ldrb r7, [r1]
+; CHECK-NEXT:    ldrb.w r1, [r12]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[0], r5
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrb.w r6, [lr]
+; CHECK-NEXT:    vmov.16 q0[1], r5
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[1], lr
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov.16 q0[2], r3
-; CHECK-NEXT:    vmov.16 q0[3], r12
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[2], r6
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.16 q0[4], r1
 ; CHECK-NEXT:    vmov.16 q0[5], r2
-; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r7
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
   %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
@@ -553,16 +503,14 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    bx lr
@@ -577,18 +525,16 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_zext32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    vmov.i32 q0, #0xff
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
+; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
@@ -600,35 +546,31 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r3, r12, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldrb r7, [r2]
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    ldrb r6, [r1]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb.w r1, [r12]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
+; CHECK-NEXT:    ldrb.w r5, [lr]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
   %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
@@ -639,34 +581,30 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_zext32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    ldrb.w lr, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vmov r12, r3, d0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldrb r7, [r2]
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb.w r2, [r12]
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w r6, [lr]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r7
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
   %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
@@ -783,37 +721,33 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    ldrb r6, [r2]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    ldrb.w r12, [r12]
+; CHECK-NEXT:    ldrb.w r2, [lr]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmovlb.s8 q1, q1
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r1
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q1, q1
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
index 6b89010baf3d7..71de7ba335f1f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
@@ -7,16 +7,14 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32_gather(i8* %base, <4 x i32>*
 ; NOGATSCAT:       @ %bb.0: @ %entry
 ; NOGATSCAT-NEXT:    vldrw.u32 q0, [r1]
 ; NOGATSCAT-NEXT:    vadd.i32 q0, q0, r0
-; NOGATSCAT-NEXT:    vmov r0, s2
-; NOGATSCAT-NEXT:    vmov r1, s0
-; NOGATSCAT-NEXT:    vmov r2, s3
-; NOGATSCAT-NEXT:    vmov r3, s1
+; NOGATSCAT-NEXT:    vmov r0, r1, d1
+; NOGATSCAT-NEXT:    vmov r2, r3, d0
 ; NOGATSCAT-NEXT:    ldr r0, [r0]
-; NOGATSCAT-NEXT:    ldr r1, [r1]
 ; NOGATSCAT-NEXT:    ldr r2, [r2]
+; NOGATSCAT-NEXT:    ldr r1, [r1]
 ; NOGATSCAT-NEXT:    ldr r3, [r3]
-; NOGATSCAT-NEXT:    vmov q0[2], q0[0], r1, r0
-; NOGATSCAT-NEXT:    vmov q0[3], q0[1], r3, r2
+; NOGATSCAT-NEXT:    vmov q0[2], q0[0], r2, r0
+; NOGATSCAT-NEXT:    vmov q0[3], q0[1], r3, r1
 ; NOGATSCAT-NEXT:    bx lr
 ;
 ; NOMVE-LABEL: unscaled_i32_i32_gather:
@@ -46,21 +44,19 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <
 define arm_aapcs_vfpcc void @unscaled_i32_i8_scatter(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
 ; NOGATSCAT-LABEL: unscaled_i32_i8_scatter:
 ; NOGATSCAT:       @ %bb.0: @ %entry
+; NOGATSCAT-NEXT:    .save {r4, r5, r7, lr}
+; NOGATSCAT-NEXT:    push {r4, r5, r7, lr}
 ; NOGATSCAT-NEXT:    vldrb.u32 q1, [r1]
-; NOGATSCAT-NEXT:    vmov r1, s0
+; NOGATSCAT-NEXT:    vmov r1, r3, d0
+; NOGATSCAT-NEXT:    vmov r4, r5, d1
 ; NOGATSCAT-NEXT:    vadd.i32 q1, q1, r0
-; NOGATSCAT-NEXT:    vmov r0, s4
-; NOGATSCAT-NEXT:    str r1, [r0]
-; NOGATSCAT-NEXT:    vmov r0, s5
-; NOGATSCAT-NEXT:    vmov r1, s1
+; NOGATSCAT-NEXT:    vmov r0, r12, d2
+; NOGATSCAT-NEXT:    vmov r2, lr, d3
 ; NOGATSCAT-NEXT:    str r1, [r0]
-; NOGATSCAT-NEXT:    vmov r0, s6
-; NOGATSCAT-NEXT:    vmov r1, s2
-; NOGATSCAT-NEXT:    str r1, [r0]
-; NOGATSCAT-NEXT:    vmov r0, s7
-; NOGATSCAT-NEXT:    vmov r1, s3
-; NOGATSCAT-NEXT:    str r1, [r0]
-; NOGATSCAT-NEXT:    bx lr
+; NOGATSCAT-NEXT:    str.w r3, [r12]
+; NOGATSCAT-NEXT:    str r4, [r2]
+; NOGATSCAT-NEXT:    str.w r5, [lr]
+; NOGATSCAT-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; NOMVE-LABEL: unscaled_i32_i8_scatter:
 ; NOMVE:       @ %bb.0: @ %entry

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index db9c286a44be8..bf601d71761cc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -4,62 +4,58 @@
 define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C) {
 ; CHECK-LABEL: loads_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
 ; CHECK-NEXT:    vldrw.u32 q5, [r2]
-; CHECK-NEXT:    vldrw.u32 q6, [r1]
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vand q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s8, s20
+; CHECK-NEXT:    vmov r4, r1, d4
+; CHECK-NEXT:    vmov.f32 s12, s6
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov r5, s12
 ; CHECK-NEXT:    vmov.f32 s16, s22
-; CHECK-NEXT:    vmov.f32 s10, s21
 ; CHECK-NEXT:    vmov.f32 s18, s23
-; CHECK-NEXT:    vmov.f32 s20, s26
-; CHECK-NEXT:    vmov.f32 s22, s27
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vand q5, q5, q3
-; CHECK-NEXT:    vmov.f32 s2, s7
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov.f32 s26, s25
-; CHECK-NEXT:    vand q3, q6, q3
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    asrs r3, r0, #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    asrl r0, r1, r2
-; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r0, r12, d5
+; CHECK-NEXT:    vmov.f32 s8, s20
+; CHECK-NEXT:    vmov.f32 s10, s21
+; CHECK-NEXT:    adds r2, r5, r4
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    asr.w r6, r5, #31
+; CHECK-NEXT:    adcs r1, r6
+; CHECK-NEXT:    asrl r2, r1, r4
 ; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    adds r4, r1, r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    adc.w r1, r12, r3
-; CHECK-NEXT:    asrl r4, r1, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    adds r2, r2, r1
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    adc.w r1, r12, r3
+; CHECK-NEXT:    adds r6, r1, r3
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    asr.w r4, r1, #31
+; CHECK-NEXT:    adc.w r1, r4, lr
+; CHECK-NEXT:    asrl r6, r1, r3
+; CHECK-NEXT:    vmov r5, r4, d1
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    asr.w r3, r1, #31
+; CHECK-NEXT:    adc.w r1, r3, r12
 ; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    asrl r2, r1, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    asrl r0, r1, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    adds r6, r1, r5
+; CHECK-NEXT:    asr.w r2, r1, #31
+; CHECK-NEXT:    adc.w r1, r2, r4
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    asrl r6, r1, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r0
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %a = load <4 x i32>, <4 x i32> *%A, align 4
   %b = load <4 x i32>, <4 x i32> *%B, align 4
@@ -142,63 +138,62 @@ entry:
 define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) {
 ; CHECK-LABEL: load_store_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
 ; CHECK-NEXT:    vldrw.u32 q5, [r2]
-; CHECK-NEXT:    vldrw.u32 q6, [r1]
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vmov.f32 s10, s9
+; CHECK-NEXT:    vand q1, q1, q0
+; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s4, s20
+; CHECK-NEXT:    vmov r5, r1, d2
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov r6, s12
 ; CHECK-NEXT:    vmov.f32 s16, s22
-; CHECK-NEXT:    vmov.f32 s6, s21
 ; CHECK-NEXT:    vmov.f32 s18, s23
-; CHECK-NEXT:    vmov.f32 s20, s26
-; CHECK-NEXT:    vmov.f32 s22, s27
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vand q5, q5, q3
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov.f32 s26, s25
-; CHECK-NEXT:    vand q3, q6, q3
+; CHECK-NEXT:    vmov r4, lr, d4
 ; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov lr, s13
-; CHECK-NEXT:    asr.w r12, r0, #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    asrl r0, r1, r2
-; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r0, r12, d3
+; CHECK-NEXT:    vmov.f32 s4, s20
+; CHECK-NEXT:    vmov.f32 s6, s21
+; CHECK-NEXT:    adds r2, r6, r5
+; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    asr.w r7, r6, #31
+; CHECK-NEXT:    adcs r1, r7
+; CHECK-NEXT:    asrl r2, r1, r5
+; CHECK-NEXT:    vmov r7, s4
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    adds r4, r1, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    adc.w r1, r12, lr
-; CHECK-NEXT:    asrl r4, r1, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r0
-; CHECK-NEXT:    vmov lr, s23
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r4, s15
-; CHECK-NEXT:    adds r2, r2, r1
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    adc.w r1, r12, lr
-; CHECK-NEXT:    vmov r12, s18
-; CHECK-NEXT:    asrl r2, r1, r12
-; CHECK-NEXT:    asr.w r12, r0, #31
+; CHECK-NEXT:    adds r4, r4, r1
+; CHECK-NEXT:    asr.w r5, r1, #31
+; CHECK-NEXT:    adc.w r1, r5, lr
+; CHECK-NEXT:    asrl r4, r1, r7
+; CHECK-NEXT:    vmov r6, r5, d5
 ; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
 ; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r12, r4
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    asrl r0, r1, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r2
+; CHECK-NEXT:    asr.w r7, r1, #31
+; CHECK-NEXT:    adc.w r1, r7, r12
+; CHECK-NEXT:    vmov r7, s18
+; CHECK-NEXT:    asrl r0, r1, r7
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    adds r6, r6, r1
+; CHECK-NEXT:    asr.w r2, r1, #31
+; CHECK-NEXT:    adc.w r1, r2, r5
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    asrl r6, r1, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r6, r0
 ; CHECK-NEXT:    vstrw.32 q2, [r3]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %a = load <4 x i32>, <4 x i32> *%A, align 4
   %b = load <4 x i32>, <4 x i32> *%B, align 4
@@ -377,33 +372,31 @@ define arm_aapcs_vfpcc void @mul_i32(<4 x i32> *%A, <4 x i32> *%B, i64 %C, <4 x
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    ldr.w lr, [sp, #20]
 ; CHECK-NEXT:    vmov.f32 s8, s0
 ; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vmov.f32 s10, s1
 ; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    smull r12, r3, r1, r0
 ; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    vmov r4, s14
 ; CHECK-NEXT:    vmov.f32 s8, s2
 ; CHECK-NEXT:    vmov.f32 s10, s3
 ; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    asrl r12, r3, r2
 ; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    vmullb.s32 q1, q0, q2
-; CHECK-NEXT:    vmov r7, s7
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    smull r0, r3, r1, r0
-; CHECK-NEXT:    ldr r1, [sp, #20]
-; CHECK-NEXT:    asrl r0, r3, r2
-; CHECK-NEXT:    smull r12, r5, r4, r5
-; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vmov r6, r1, d2
+; CHECK-NEXT:    vmov r4, r7, d3
+; CHECK-NEXT:    asrl r6, r1, r2
 ; CHECK-NEXT:    asrl r4, r7, r2
-; CHECK-NEXT:    vmov r7, s5
-; CHECK-NEXT:    asrl r6, r7, r2
-; CHECK-NEXT:    asrl r12, r5, r2
+; CHECK-NEXT:    smull r0, r5, r5, r0
+; CHECK-NEXT:    asrl r0, r5, r2
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r6
 ; CHECK-NEXT:    vmov q0[3], q0[1], r12, r4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vstrw.32 q0, [lr]
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %a = load <4 x i32>, <4 x i32> *%A, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index 47cd236ccbb6f..8e7a7eb5dad9b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -180,52 +180,45 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: ext_add_ashr_trunc_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s16, s6
-; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov.f32 s12, s6
+; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vand q4, q4, q3
 ; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vand q3, q1, q3
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov r2, r3, d2
 ; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov lr, s19
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r12, lr, d7
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    asr.w r5, r4, #31
+; CHECK-NEXT:    adcs r1, r5
 ; CHECK-NEXT:    lsrl r0, r1, #1
-; CHECK-NEXT:    vmov r3, s13
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    adds r2, r2, r1
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    adc.w r1, r12, r3
-; CHECK-NEXT:    lsrl r2, r1, #1
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    asr.w r4, r1, #31
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    lsrl r2, r3, #1
+; CHECK-NEXT:    vmov r1, r5, d3
+; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    adds r4, r1, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    adc.w r1, r12, lr
-; CHECK-NEXT:    lsrl r4, r1, #1
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adds.w r4, r3, r12
+; CHECK-NEXT:    asr.w r6, r3, #31
+; CHECK-NEXT:    adc.w r3, r6, lr
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r5
+; CHECK-NEXT:    lsrl r4, r3, #1
 ; CHECK-NEXT:    lsrl r0, r1, #1
 ; CHECK-NEXT:    vmov q1[3], q1[1], r0, r4
 ; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %sa = sext <4 x i32> %a to <4 x i64>
   %sb = zext <4 x i32> %b to <4 x i64>
@@ -282,112 +275,105 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: ext_add_ashr_trunc_i8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.i32 q2, #0xff
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.u8 r4, q1[6]
+; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmovlb.s8 q4, q4
 ; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmovlb.s16 q4, q4
+; CHECK-NEXT:    vmov.u8 r5, q1[4]
+; CHECK-NEXT:    vadd.i32 q3, q4, q3
+; CHECK-NEXT:    vshr.u32 q3, q3, #1
+; CHECK-NEXT:    vmov lr, r12, d7
+; CHECK-NEXT:    vmov r3, r2, d6
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
 ; CHECK-NEXT:    vmovlb.s8 q4, q4
 ; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vadd.i32 q2, q4, q2
-; CHECK-NEXT:    vshr.u32 q4, q2, #1
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.8 q2[0], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.8 q2[1], r0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.8 q2[2], r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.8 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vand q4, q4, q3
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q1[8]
-; CHECK-NEXT:    vmovlb.s8 q5, q5
-; CHECK-NEXT:    vmovlb.s16 q5, q5
-; CHECK-NEXT:    vadd.i32 q4, q5, q4
-; CHECK-NEXT:    vshr.u32 q4, q4, #1
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.8 q2[4], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.8 q2[5], r0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.8 q2[6], r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.8 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.u8 r1, q1[9]
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vand q4, q4, q3
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vadd.i32 q3, q4, q3
+; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[7]
+; CHECK-NEXT:    vmov.u8 r5, q1[5]
+; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[6]
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
+; CHECK-NEXT:    vshr.u32 q3, q3, #1
+; CHECK-NEXT:    vmov q5[2], q5[0], r5, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[7]
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov q5[3], q5[1], r5, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[10]
 ; CHECK-NEXT:    vmovlb.s8 q5, q5
+; CHECK-NEXT:    vmov.u8 r5, q0[8]
 ; CHECK-NEXT:    vmovlb.s16 q5, q5
+; CHECK-NEXT:    vmov r1, r0, d6
 ; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov q5[2], q5[0], r5, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    vmov.u8 r5, q0[9]
+; CHECK-NEXT:    vmov q5[3], q5[1], r5, r4
+; CHECK-NEXT:    vmov.8 q0[0], r1
+; CHECK-NEXT:    vmov.u8 r4, q1[10]
+; CHECK-NEXT:    vmov.u8 r5, q1[8]
+; CHECK-NEXT:    vmov q6[2], q6[0], r5, r4
+; CHECK-NEXT:    vmov.8 q0[1], r0
+; CHECK-NEXT:    vmov r0, r1, d7
+; CHECK-NEXT:    vmov.u8 r4, q1[11]
+; CHECK-NEXT:    vmov.u8 r5, q1[9]
+; CHECK-NEXT:    vmov.8 q0[2], r0
+; CHECK-NEXT:    vmov q6[3], q6[1], r5, r4
 ; CHECK-NEXT:    vshr.u32 q4, q4, #1
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.8 q2[8], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.8 q2[9], r0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.8 q2[10], r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.8 q2[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.u8 r1, q1[13]
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vand q1, q4, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmovlb.s8 q0, q3
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.u32 q0, q0, #1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.8 q2[12], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.8 q2[13], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.8 q2[14], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.8 q2[15], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.8 q0[3], r1
+; CHECK-NEXT:    vmov r0, r1, d8
+; CHECK-NEXT:    vand q1, q6, q2
+; CHECK-NEXT:    vmovlb.s8 q2, q5
+; CHECK-NEXT:    vmov.8 q0[4], r0
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vadd.i32 q1, q2, q1
+; CHECK-NEXT:    vmov r4, r5, d9
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vshr.u32 q1, q1, #1
+; CHECK-NEXT:    vmov.8 q0[6], r4
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov.8 q0[7], r5
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    vmov.8 q0[8], r4
+; CHECK-NEXT:    vmov.8 q0[9], r5
+; CHECK-NEXT:    vmov.8 q0[10], r0
+; CHECK-NEXT:    vmov.8 q0[11], r1
+; CHECK-NEXT:    vmov.8 q0[12], r3
+; CHECK-NEXT:    vmov.8 q0[13], r2
+; CHECK-NEXT:    vmov.8 q0[14], lr
+; CHECK-NEXT:    vmov.8 q0[15], r12
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sa = sext <16 x i8> %a to <16 x i32>
   %sb = zext <16 x i8> %b to <16 x i32>
@@ -406,114 +392,115 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
+; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
 ; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
 ; CHECK-NEXT:    vmov.f32 s12, s2
-; CHECK-NEXT:    vand q2, q2, q4
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov r10, s12
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov r9, s0
-; CHECK-NEXT:    vmov r11, s4
-; CHECK-NEXT:    vmov r7, s5
-; CHECK-NEXT:    adds.w r2, r10, r4
-; CHECK-NEXT:    asr.w r0, r10, #31
-; CHECK-NEXT:    adc.w r5, r0, r1
-; CHECK-NEXT:    asrl r2, r5, r4
-; CHECK-NEXT:    subs r6, r2, r4
-; CHECK-NEXT:    sbc.w r12, r5, r1
-; CHECK-NEXT:    adds.w r0, r9, r11
-; CHECK-NEXT:    asr.w r2, r9, #31
-; CHECK-NEXT:    adc.w r3, r2, r7
-; CHECK-NEXT:    umull r8, r5, r6, r4
-; CHECK-NEXT:    asrl r0, r3, r11
-; CHECK-NEXT:    subs.w r0, r0, r11
-; CHECK-NEXT:    mla r5, r12, r4, r5
-; CHECK-NEXT:    sbc.w r12, r3, r7
-; CHECK-NEXT:    umull r2, r1, r0, r11
-; CHECK-NEXT:    muls r0, r7, r0
-; CHECK-NEXT:    vmov r7, s14
-; CHECK-NEXT:    orr.w lr, r1, r0
-; CHECK-NEXT:    rsb.w r0, r10, #0
-; CHECK-NEXT:    lsll r8, r5, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    lsll r8, r5, r4
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    eor.w r4, r4, r10
-; CHECK-NEXT:    orr.w r4, r4, r10, asr #31
-; CHECK-NEXT:    asrs r3, r7, #31
-; CHECK-NEXT:    adds r6, r7, r0
-; CHECK-NEXT:    adcs r3, r1
-; CHECK-NEXT:    asrl r6, r3, r0
-; CHECK-NEXT:    subs r6, r6, r0
-; CHECK-NEXT:    sbc.w r1, r3, r1
-; CHECK-NEXT:    umull r6, r3, r6, r0
-; CHECK-NEXT:    mla r1, r1, r0, r3
-; CHECK-NEXT:    rsbs r3, r7, #0
-; CHECK-NEXT:    lsll r6, r1, r3
-; CHECK-NEXT:    lsll r6, r1, r0
-; CHECK-NEXT:    eors r0, r7
-; CHECK-NEXT:    vmov q3[2], q3[0], r8, r6
-; CHECK-NEXT:    vmov r6, s2
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    orr.w r0, r0, r7, asr #31
-; CHECK-NEXT:    adds.w r8, r6, r5
-; CHECK-NEXT:    eor.w r7, r6, r5
-; CHECK-NEXT:    asr.w r3, r6, #31
-; CHECK-NEXT:    orr.w r7, r7, r6, asr #31
-; CHECK-NEXT:    adcs r3, r1
-; CHECK-NEXT:    asrl r8, r3, r5
-; CHECK-NEXT:    subs.w r8, r8, r5
-; CHECK-NEXT:    sbcs r3, r1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    mul r1, r8, r1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov r12, r2, d5
+; CHECK-NEXT:    vmov r8, r9, d3
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov lr, s2
+; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    adds.w r4, r1, r12
+; CHECK-NEXT:    asr.w r0, r1, #31
+; CHECK-NEXT:    adc.w r5, r0, r2
+; CHECK-NEXT:    asrl r4, r5, r12
+; CHECK-NEXT:    subs.w r0, r4, r12
+; CHECK-NEXT:    sbc.w r2, r5, r2
+; CHECK-NEXT:    asr.w r5, lr, #31
+; CHECK-NEXT:    umull r0, r4, r0, r12
+; CHECK-NEXT:    adds.w r6, lr, r8
+; CHECK-NEXT:    adc.w r5, r5, r9
+; CHECK-NEXT:    asrl r6, r5, r8
+; CHECK-NEXT:    mla r3, r2, r12, r4
+; CHECK-NEXT:    subs.w r7, r6, r8
+; CHECK-NEXT:    sbc.w r10, r5, r9
+; CHECK-NEXT:    rsbs r2, r1, #0
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    lsll r0, r3, r2
+; CHECK-NEXT:    vmov r6, r2, d4
+; CHECK-NEXT:    lsll r0, r3, r12
+; CHECK-NEXT:    asrs r3, r5, #31
+; CHECK-NEXT:    adds r4, r5, r6
+; CHECK-NEXT:    adcs r3, r2
+; CHECK-NEXT:    asrl r4, r3, r6
+; CHECK-NEXT:    subs r4, r4, r6
+; CHECK-NEXT:    sbc.w r2, r3, r2
+; CHECK-NEXT:    umull r4, r3, r4, r6
+; CHECK-NEXT:    mla r3, r2, r6, r3
+; CHECK-NEXT:    rsbs r2, r5, #0
+; CHECK-NEXT:    lsll r4, r3, r2
+; CHECK-NEXT:    lsll r4, r3, r6
+; CHECK-NEXT:    eors r6, r5
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r0
+; CHECK-NEXT:    umull r2, r0, r7, r8
+; CHECK-NEXT:    orr.w r6, r6, r5, asr #31
+; CHECK-NEXT:    mul r3, r7, r9
+; CHECK-NEXT:    vmov r7, s0
+; CHECK-NEXT:    orrs r0, r3
+; CHECK-NEXT:    vmov r3, r4, d2
+; CHECK-NEXT:    mla r11, r10, r8, r0
+; CHECK-NEXT:    asr.w r9, r7, #31
+; CHECK-NEXT:    adds r0, r7, r3
+; CHECK-NEXT:    adc.w r9, r9, r4
+; CHECK-NEXT:    asrl r0, r9, r3
+; CHECK-NEXT:    subs.w r10, r0, r3
+; CHECK-NEXT:    sbc.w r9, r9, r4
+; CHECK-NEXT:    umull r0, r1, r10, r3
+; CHECK-NEXT:    mul r4, r10, r4
+; CHECK-NEXT:    orr.w r10, r1, r4
+; CHECK-NEXT:    eor.w r1, lr, r8
+; CHECK-NEXT:    orr.w r1, r1, lr, asr #31
+; CHECK-NEXT:    eor.w r4, r7, r3
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    orr.w r4, r4, r7, asr #31
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    rsbs r7, r7, #0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    cset r4, eq
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov.32 q4[1], r4
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r0
-; CHECK-NEXT:    umull r4, r0, r8, r5
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    eor.w r1, r9, r11
-; CHECK-NEXT:    orr.w r1, r1, r9, asr #31
+; CHECK-NEXT:    vmov.32 q0[1], r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r1
+; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vbic q4, q1, q0
+; CHECK-NEXT:    eor.w r1, r4, r12
+; CHECK-NEXT:    orr.w r1, r1, r4, asr #31
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    cmp r7, #0
-; CHECK-NEXT:    cset r7, eq
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    cmp r7, #0
-; CHECK-NEXT:    csetm r7, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r7
-; CHECK-NEXT:    mla r7, r3, r5, r0
-; CHECK-NEXT:    rsbs r1, r6, #0
-; CHECK-NEXT:    vbic q5, q1, q0
-; CHECK-NEXT:    mla r3, r12, r11, lr
-; CHECK-NEXT:    rsb.w r0, r9, #0
-; CHECK-NEXT:    lsll r4, r7, r1
-; CHECK-NEXT:    vbic q1, q2, q4
-; CHECK-NEXT:    lsll r2, r3, r0
-; CHECK-NEXT:    vand q2, q3, q4
-; CHECK-NEXT:    lsll r4, r7, r5
-; CHECK-NEXT:    lsll r2, r3, r11
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    cset r6, eq
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    csetm r6, ne
+; CHECK-NEXT:    vmov.32 q5[1], r6
+; CHECK-NEXT:    vmov q5[2], q5[0], r6, r1
+; CHECK-NEXT:    mla r1, r9, r3, r10
+; CHECK-NEXT:    rsb.w r6, lr, #0
+; CHECK-NEXT:    vbic q1, q2, q5
+; CHECK-NEXT:    lsll r2, r11, r6
+; CHECK-NEXT:    lsll r0, r1, r7
+; CHECK-NEXT:    vand q2, q3, q5
+; CHECK-NEXT:    lsll r2, r11, r8
+; CHECK-NEXT:    lsll r0, r1, r3
 ; CHECK-NEXT:    vorr q1, q2, q1
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vorr q0, q0, q5
+; CHECK-NEXT:    vorr q0, q0, q4
 ; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index 02895b0a214ca..352a7f797a438 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -1750,28 +1750,28 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(<2 x i64> *%dest, <2 x i64> %a) {
 ; CHECK-LE-LABEL: masked_v2i64_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    vmov r3, s0
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmov r1, s1
-; CHECK-LE-NEXT:    vmov r12, s3
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vmov r3, s2
-; CHECK-LE-NEXT:    sbcs.w r1, r2, r1
+; CHECK-LE-NEXT:    vmov r1, r2, d0
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    vmov lr, r12, d1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    sbcs.w r1, r3, r2
 ; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r2, r12
+; CHECK-LE-NEXT:    rsbs.w r2, lr, #0
+; CHECK-LE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r2, #1
-; CHECK-LE-NEXT:    cmp r2, #0
+; CHECK-LE-NEXT:    movlt r3, #1
+; CHECK-LE-NEXT:    cmp r3, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
-; CHECK-LE-NEXT:    and r1, r2, #3
-; CHECK-LE-NEXT:    lsls r2, r2, #31
+; CHECK-LE-NEXT:    mvnne r3, #1
+; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-LE-NEXT:    and r1, r3, #3
+; CHECK-LE-NEXT:    lsls r2, r3, #31
 ; CHECK-LE-NEXT:    beq .LBB49_2
 ; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-LE-NEXT:    vldr d1, .LCPI49_0
@@ -1784,7 +1784,7 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(<2 x i64> *%dest, <2
 ; CHECK-LE-NEXT:    it mi
 ; CHECK-LE-NEXT:    vldrmi d1, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    bx lr
+; CHECK-LE-NEXT:    pop {r7, pc}
 ; CHECK-LE-NEXT:    .p2align 3
 ; CHECK-LE-NEXT:  @ %bb.4:
 ; CHECK-LE-NEXT:  .LCPI49_0:
@@ -1793,29 +1793,29 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(<2 x i64> *%dest, <2
 ;
 ; CHECK-BE-LABEL: masked_v2i64_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vmov r3, s7
-; CHECK-BE-NEXT:    vmov r1, s6
-; CHECK-BE-NEXT:    vmov r12, s4
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vmov r3, s5
-; CHECK-BE-NEXT:    sbcs.w r1, r2, r1
+; CHECK-BE-NEXT:    movs r3, #0
+; CHECK-BE-NEXT:    vmov r1, r2, d3
+; CHECK-BE-NEXT:    vmov r12, lr, d2
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    sbcs.w r1, r3, r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    sbcs.w r3, r2, r12
+; CHECK-BE-NEXT:    rsbs.w r2, lr, #0
+; CHECK-BE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-BE-NEXT:    it lt
-; CHECK-BE-NEXT:    movlt r2, #1
-; CHECK-BE-NEXT:    cmp r2, #0
+; CHECK-BE-NEXT:    movlt r3, #1
+; CHECK-BE-NEXT:    cmp r3, #0
 ; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    mvnne r2, #1
-; CHECK-BE-NEXT:    bfi r2, r1, #0, #1
-; CHECK-BE-NEXT:    and r1, r2, #3
-; CHECK-BE-NEXT:    lsls r2, r2, #30
+; CHECK-BE-NEXT:    mvnne r3, #1
+; CHECK-BE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-BE-NEXT:    and r1, r3, #3
+; CHECK-BE-NEXT:    lsls r2, r3, #30
 ; CHECK-BE-NEXT:    bpl .LBB49_2
 ; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-BE-NEXT:    vldr d1, .LCPI49_0
@@ -1828,7 +1828,7 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(<2 x i64> *%dest, <2
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vldrne d1, [r0, #8]
 ; CHECK-BE-NEXT:    add sp, #4
-; CHECK-BE-NEXT:    bx lr
+; CHECK-BE-NEXT:    pop {r7, pc}
 ; CHECK-BE-NEXT:    .p2align 3
 ; CHECK-BE-NEXT:  @ %bb.4:
 ; CHECK-BE-NEXT:  .LCPI49_0:
@@ -1843,28 +1843,28 @@ entry:
 define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(<2 x double> *%dest, <2 x double> %a, <2 x i64> %b) {
 ; CHECK-LE-LABEL: masked_v2f64_align4_zero:
 ; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmov r1, s5
-; CHECK-LE-NEXT:    vmov r12, s7
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    vmov r3, s6
-; CHECK-LE-NEXT:    sbcs.w r1, r2, r1
+; CHECK-LE-NEXT:    vmov r1, r2, d2
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    vmov lr, r12, d3
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    sbcs.w r1, r3, r2
 ; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r3, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r2, r12
+; CHECK-LE-NEXT:    rsbs.w r2, lr, #0
+; CHECK-LE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r2, #1
-; CHECK-LE-NEXT:    cmp r2, #0
+; CHECK-LE-NEXT:    movlt r3, #1
+; CHECK-LE-NEXT:    cmp r3, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
-; CHECK-LE-NEXT:    and r1, r2, #3
-; CHECK-LE-NEXT:    lsls r2, r2, #31
+; CHECK-LE-NEXT:    mvnne r3, #1
+; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-LE-NEXT:    and r1, r3, #3
+; CHECK-LE-NEXT:    lsls r2, r3, #31
 ; CHECK-LE-NEXT:    beq .LBB50_2
 ; CHECK-LE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-LE-NEXT:    vldr d1, .LCPI50_0
@@ -1877,7 +1877,7 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(<2 x double> *%des
 ; CHECK-LE-NEXT:    it mi
 ; CHECK-LE-NEXT:    vldrmi d1, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    bx lr
+; CHECK-LE-NEXT:    pop {r7, pc}
 ; CHECK-LE-NEXT:    .p2align 3
 ; CHECK-LE-NEXT:  @ %bb.4:
 ; CHECK-LE-NEXT:  .LCPI50_0:
@@ -1886,29 +1886,29 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(<2 x double> *%des
 ;
 ; CHECK-BE-LABEL: masked_v2f64_align4_zero:
 ; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
-; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    vmov r3, s3
-; CHECK-BE-NEXT:    vmov r1, s2
-; CHECK-BE-NEXT:    vmov r12, s0
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    vmov r3, s1
-; CHECK-BE-NEXT:    sbcs.w r1, r2, r1
+; CHECK-BE-NEXT:    movs r3, #0
+; CHECK-BE-NEXT:    vmov r1, r2, d1
+; CHECK-BE-NEXT:    vmov r12, lr, d0
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    sbcs.w r1, r3, r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
-; CHECK-BE-NEXT:    rsbs r3, r3, #0
-; CHECK-BE-NEXT:    sbcs.w r3, r2, r12
+; CHECK-BE-NEXT:    rsbs.w r2, lr, #0
+; CHECK-BE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-BE-NEXT:    it lt
-; CHECK-BE-NEXT:    movlt r2, #1
-; CHECK-BE-NEXT:    cmp r2, #0
+; CHECK-BE-NEXT:    movlt r3, #1
+; CHECK-BE-NEXT:    cmp r3, #0
 ; CHECK-BE-NEXT:    it ne
-; CHECK-BE-NEXT:    mvnne r2, #1
-; CHECK-BE-NEXT:    bfi r2, r1, #0, #1
-; CHECK-BE-NEXT:    and r1, r2, #3
-; CHECK-BE-NEXT:    lsls r2, r2, #30
+; CHECK-BE-NEXT:    mvnne r3, #1
+; CHECK-BE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-BE-NEXT:    and r1, r3, #3
+; CHECK-BE-NEXT:    lsls r2, r3, #30
 ; CHECK-BE-NEXT:    bpl .LBB50_2
 ; CHECK-BE-NEXT:  @ %bb.1: @ %cond.load
 ; CHECK-BE-NEXT:    vldr d1, .LCPI50_0
@@ -1921,7 +1921,7 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(<2 x double> *%des
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vldrne d1, [r0, #8]
 ; CHECK-BE-NEXT:    add sp, #4
-; CHECK-BE-NEXT:    bx lr
+; CHECK-BE-NEXT:    pop {r7, pc}
 ; CHECK-BE-NEXT:    .p2align 3
 ; CHECK-BE-NEXT:  @ %bb.4:
 ; CHECK-BE-NEXT:  .LCPI50_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
index 8c94742b7f00a..02b29769382b2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -935,19 +935,19 @@ entry:
 define arm_aapcs_vfpcc void @masked_v2i64(<2 x i64> *%dest, <2 x i64> %a) {
 ; CHECK-LE-LABEL: masked_v2i64:
 ; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    vmov r2, s0
+; CHECK-LE-NEXT:    vmov r1, r2, d0
 ; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    vmov r1, s1
-; CHECK-LE-NEXT:    vmov r12, s3
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    vmov r2, s2
-; CHECK-LE-NEXT:    sbcs.w r1, r3, r1
+; CHECK-LE-NEXT:    vmov lr, r12, d1
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    sbcs.w r1, r3, r2
 ; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    rsbs.w r2, lr, #0
 ; CHECK-LE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r3, #1
@@ -963,24 +963,24 @@ define arm_aapcs_vfpcc void @masked_v2i64(<2 x i64> *%dest, <2 x i64> %a) {
 ; CHECK-LE-NEXT:    it mi
 ; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    bx lr
+; CHECK-LE-NEXT:    pop {r7, pc}
 ;
 ; CHECK-BE-LABEL: masked_v2i64:
 ; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    movs r3, #0
-; CHECK-BE-NEXT:    vmov r2, s7
-; CHECK-BE-NEXT:    vmov r1, s6
-; CHECK-BE-NEXT:    vmov r12, s4
+; CHECK-BE-NEXT:    vmov r1, r2, d3
+; CHECK-BE-NEXT:    vmov r12, lr, d2
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    vmov r2, s5
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    rsbs.w r2, lr, #0
 ; CHECK-BE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r3, #1
@@ -996,7 +996,7 @@ define arm_aapcs_vfpcc void @masked_v2i64(<2 x i64> *%dest, <2 x i64> %a) {
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vstrne d1, [r0, #8]
 ; CHECK-BE-NEXT:    add sp, #4
-; CHECK-BE-NEXT:    bx lr
+; CHECK-BE-NEXT:    pop {r7, pc}
 entry:
   %c = icmp sgt <2 x i64> %a, zeroinitializer
   call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %a, <2 x i64>* %dest, i32 8, <2 x i1> %c)
@@ -1006,19 +1006,19 @@ entry:
 define arm_aapcs_vfpcc void @masked_v2f64(<2 x double> *%dest, <2 x double> %a, <2 x i64> %b) {
 ; CHECK-LE-LABEL: masked_v2f64:
 ; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    vmov r2, s4
+; CHECK-LE-NEXT:    vmov r1, r2, d2
 ; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    vmov r1, s5
-; CHECK-LE-NEXT:    vmov r12, s7
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
-; CHECK-LE-NEXT:    vmov r2, s6
-; CHECK-LE-NEXT:    sbcs.w r1, r3, r1
+; CHECK-LE-NEXT:    vmov lr, r12, d3
+; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    sbcs.w r1, r3, r2
 ; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    rsbs.w r2, lr, #0
 ; CHECK-LE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r3, #1
@@ -1034,24 +1034,24 @@ define arm_aapcs_vfpcc void @masked_v2f64(<2 x double> *%dest, <2 x double> %a,
 ; CHECK-LE-NEXT:    it mi
 ; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    bx lr
+; CHECK-LE-NEXT:    pop {r7, pc}
 ;
 ; CHECK-BE-LABEL: masked_v2f64:
 ; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    movs r3, #0
-; CHECK-BE-NEXT:    vmov r2, s11
-; CHECK-BE-NEXT:    vmov r1, s10
-; CHECK-BE-NEXT:    vmov r12, s8
+; CHECK-BE-NEXT:    vmov r1, r2, d5
+; CHECK-BE-NEXT:    vmov r12, lr, d4
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
-; CHECK-BE-NEXT:    vmov r2, s9
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
-; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    rsbs.w r2, lr, #0
 ; CHECK-BE-NEXT:    sbcs.w r2, r3, r12
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r3, #1
@@ -1067,7 +1067,7 @@ define arm_aapcs_vfpcc void @masked_v2f64(<2 x double> *%dest, <2 x double> %a,
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    vstrne d1, [r0, #8]
 ; CHECK-BE-NEXT:    add sp, #4
-; CHECK-BE-NEXT:    bx lr
+; CHECK-BE-NEXT:    pop {r7, pc}
 entry:
   %c = icmp sgt <2 x i64> %b, zeroinitializer
   call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %a, <2 x double>* %dest, i32 8, <2 x i1> %c)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index 1b45d27317acd..6b2939c3a0c1b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -38,36 +38,32 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @smin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-LABEL: smin_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r12, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    sbcs.w r1, r1, r12
-; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    sbcs.w r2, lr, r12
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r12, lr, d2
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    subs.w r1, r4, r12
+; CHECK-NEXT:    sbcs.w r1, r5, lr
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r6, #1
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %0 = icmp slt <2 x i64> %s1, %s2
   %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2
@@ -110,36 +106,32 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @umin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-LABEL: umin_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r12, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    sbcs.w r1, r1, r12
-; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    sbcs.w r2, lr, r12
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r12, lr, d2
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    subs.w r1, r4, r12
+; CHECK-NEXT:    sbcs.w r1, r5, lr
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r6, #1
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %0 = icmp ult <2 x i64> %s1, %s2
   %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2
@@ -183,36 +175,32 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @smax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-LABEL: smax_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vmov lr, s5
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    sbcs.w r1, r1, r12
-; CHECK-NEXT:    vmov r12, s1
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    sbcs.w r2, lr, r12
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    vmov r12, lr, d0
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    subs.w r1, r4, r12
+; CHECK-NEXT:    sbcs.w r1, r5, lr
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r6, #1
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %0 = icmp sgt <2 x i64> %s1, %s2
   %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2
@@ -255,36 +243,32 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @umax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-LABEL: umax_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vmov lr, s5
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    sbcs.w r1, r1, r12
-; CHECK-NEXT:    vmov r12, s1
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    sbcs.w r2, lr, r12
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    vmov r12, lr, d0
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    subs.w r1, r4, r12
+; CHECK-NEXT:    sbcs.w r1, r5, lr
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r6, #1
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %0 = icmp ugt <2 x i64> %s1, %s2
   %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-neg.ll b/llvm/test/CodeGen/Thumb2/mve-neg.ll
index ea4ef0921a97b..ef14d7599a7b2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-neg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-neg.ll
@@ -34,17 +34,15 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @neg_v2i64(<2 x i64> %s1) {
 ; CHECK-LABEL: neg_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    sbc.w r0, r12, r0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    sbc.w r3, r12, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    sbc.w r1, r12, r1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    sbc.w r2, r12, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sub nsw <2 x i64> zeroinitializer, %s1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll
index 76b7d87544f1e..24eef30656e95 100644
--- a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll
@@ -102,31 +102,31 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK-NOFP-LABEL: vector_add_f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    .save {r7, lr}
-; CHECK-NOFP-NEXT:    push {r7, lr}
+; CHECK-NOFP-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NOFP-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NOFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NOFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NOFP-NEXT:    vmov q4, q1
+; CHECK-NOFP-NEXT:    vmov q5, q1
 ; CHECK-NOFP-NEXT:    vmov q6, q0
-; CHECK-NOFP-NEXT:    vmov r0, s27
-; CHECK-NOFP-NEXT:    vmov r1, s19
+; CHECK-NOFP-NEXT:    vmov r4, r0, d13
+; CHECK-NOFP-NEXT:    vmov r5, r1, d11
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
-; CHECK-NOFP-NEXT:    vmov s23, r0
-; CHECK-NOFP-NEXT:    vmov r0, s26
-; CHECK-NOFP-NEXT:    vmov r1, s18
+; CHECK-NOFP-NEXT:    vmov s19, r0
+; CHECK-NOFP-NEXT:    mov r0, r4
+; CHECK-NOFP-NEXT:    mov r1, r5
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
-; CHECK-NOFP-NEXT:    vmov s22, r0
-; CHECK-NOFP-NEXT:    vmov r0, s25
-; CHECK-NOFP-NEXT:    vmov r1, s17
+; CHECK-NOFP-NEXT:    vmov s18, r0
+; CHECK-NOFP-NEXT:    vmov r4, r0, d12
+; CHECK-NOFP-NEXT:    vmov r5, r1, d10
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
-; CHECK-NOFP-NEXT:    vmov s21, r0
-; CHECK-NOFP-NEXT:    vmov r0, s24
-; CHECK-NOFP-NEXT:    vmov r1, s16
+; CHECK-NOFP-NEXT:    vmov s17, r0
+; CHECK-NOFP-NEXT:    mov r0, r4
+; CHECK-NOFP-NEXT:    mov r1, r5
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
-; CHECK-NOFP-NEXT:    vmov s20, r0
-; CHECK-NOFP-NEXT:    vmov q0, q5
+; CHECK-NOFP-NEXT:    vmov s16, r0
+; CHECK-NOFP-NEXT:    vmov q0, q4
 ; CHECK-NOFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NOFP-NEXT:    pop {r7, pc}
+; CHECK-NOFP-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-FP-LABEL: vector_add_f32:
 ; CHECK-FP:       @ %bb.0: @ %entry

diff  --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index 252f9c6439ec9..b746dc87cb8a6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -6,54 +6,52 @@
 define arm_aapcs_vfpcc void @k() {
 ; CHECK-LABEL: k:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    adr r5, .LCPI0_0
-; CHECK-NEXT:    adr r4, .LCPI0_1
-; CHECK-NEXT:    vldrw.u32 q5, [r5]
-; CHECK-NEXT:    vldrw.u32 q6, [r4]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    adr.w r8, .LCPI0_0
+; CHECK-NEXT:    adr.w r9, .LCPI0_1
+; CHECK-NEXT:    vldrw.u32 q6, [r8]
+; CHECK-NEXT:    vldrw.u32 q5, [r9]
 ; CHECK-NEXT:    vmov.i32 q0, #0x1
 ; CHECK-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vmov.i16 q3, #0x6
 ; CHECK-NEXT:    vmov.i16 q4, #0x3
-; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vand q5, q5, q0
 ; CHECK-NEXT:    vand q6, q6, q0
+; CHECK-NEXT:    vand q5, q5, q0
+; CHECK-NEXT:    vcmp.i32 eq, q6, zr
+; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    vpsel q6, q2, q1
 ; CHECK-NEXT:    vcmp.i32 eq, q5, zr
 ; CHECK-NEXT:    vpsel q5, q2, q1
-; CHECK-NEXT:    vcmp.i32 eq, q6, zr
-; CHECK-NEXT:    vpsel q7, q2, q1
-; CHECK-NEXT:    vmov r1, s28
-; CHECK-NEXT:    vmov.16 q6[0], r1
-; CHECK-NEXT:    vmov r1, s29
-; CHECK-NEXT:    vmov.16 q6[1], r1
-; CHECK-NEXT:    vmov r1, s30
-; CHECK-NEXT:    vmov.16 q6[2], r1
-; CHECK-NEXT:    vmov r1, s31
-; CHECK-NEXT:    vmov.16 q6[3], r1
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vmov.16 q6[4], r1
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov.16 q6[5], r1
-; CHECK-NEXT:    vmov r1, s22
-; CHECK-NEXT:    vmov.16 q6[6], r1
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov.16 q6[7], r1
-; CHECK-NEXT:    vcmp.i16 ne, q6, zr
-; CHECK-NEXT:    vmov.i32 q6, #0x0
-; CHECK-NEXT:    vpsel q5, q4, q3
-; CHECK-NEXT:    vstrh.16 q5, [r0]
-; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    cbz r0, .LBB0_2
-; CHECK-NEXT:    le .LBB0_1
-; CHECK-NEXT:  .LBB0_2: @ %for.cond4.preheader
+; CHECK-NEXT:    vmov r4, r0, d12
+; CHECK-NEXT:    vmov r3, r6, d10
+; CHECK-NEXT:    vmov r1, r2, d11
+; CHECK-NEXT:    vmov.16 q5[0], r3
+; CHECK-NEXT:    vmov.16 q5[1], r6
+; CHECK-NEXT:    vmov r5, r7, d13
+; CHECK-NEXT:    vmov.16 q5[2], r1
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.16 q5[4], r4
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.16 q5[6], r5
+; CHECK-NEXT:    vmov.16 q5[7], r7
+; CHECK-NEXT:    vcmp.i16 ne, q5, zr
+; CHECK-NEXT:    vmov.i32 q5, #0x0
+; CHECK-NEXT:    vpsel q6, q4, q3
+; CHECK-NEXT:    vstrh.16 q6, [r0]
+; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    bne .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond4.preheader
 ; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:    cbnz r6, .LBB0_5
 ; CHECK-NEXT:  .LBB0_3: @ %for.body10
@@ -63,8 +61,8 @@ define arm_aapcs_vfpcc void @k() {
 ; CHECK-NEXT:  .LBB0_4: @ %for.cond4.loopexit
 ; CHECK-NEXT:    bl l
 ; CHECK-NEXT:  .LBB0_5: @ %vector.body105.preheader
-; CHECK-NEXT:    vldrw.u32 q0, [r5]
-; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    vldrw.u32 q0, [r8]
+; CHECK-NEXT:    vldrw.u32 q1, [r9]
 ; CHECK-NEXT:    vmov.i32 q2, #0x8
 ; CHECK-NEXT:  .LBB0_6: @ %vector.body105
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -73,7 +71,7 @@ define arm_aapcs_vfpcc void @k() {
 ; CHECK-NEXT:    cbz r6, .LBB0_7
 ; CHECK-NEXT:    le .LBB0_6
 ; CHECK-NEXT:  .LBB0_7: @ %vector.body115.ph
-; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    vldrw.u32 q0, [r9]
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    @APP
 ; CHECK-NEXT:    nop

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
index 1a3993097fa85..0383bac87062d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
@@ -575,11 +575,9 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q2, q0, q1
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -604,33 +602,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r12, r2, d4
+; CHECK-NEXT:    vmov r3, r1, d2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    eor.w r2, r3, r12
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -656,29 +648,25 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqr_v2i1(<2 x i64> %a, <2 x i64> %b, i64 %c) {
 ; CHECK-LABEL: cmpeqr_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    eors r2, r1
-; CHECK-NEXT:    eors r3, r0
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    eors r3, r1
+; CHECK-NEXT:    eors r2, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov r12, r3, d2
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne
 ; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    eor.w r0, r0, r12
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vmov q2[3], q2[1], r0, r2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
index 53f2ea04674f5..e10d984dd39fa 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
@@ -389,11 +389,9 @@ define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) {
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    vmov r0, s1
-; CHECK-LE-NEXT:    vmov r1, s0
-; CHECK-LE-NEXT:    vmov r2, s2
+; CHECK-LE-NEXT:    vmov r0, r1, d0
 ; CHECK-LE-NEXT:    orrs r0, r1
-; CHECK-LE-NEXT:    vmov r1, s3
+; CHECK-LE-NEXT:    vmov r1, r2, d1
 ; CHECK-LE-NEXT:    cset r0, eq
 ; CHECK-LE-NEXT:    orrs r1, r2
 ; CHECK-LE-NEXT:    cset r1, eq
@@ -410,11 +408,9 @@ define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) {
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vmov r0, s6
-; CHECK-BE-NEXT:    vmov r1, s7
-; CHECK-BE-NEXT:    vmov r2, s5
+; CHECK-BE-NEXT:    vmov r0, r1, d3
 ; CHECK-BE-NEXT:    orrs r0, r1
-; CHECK-BE-NEXT:    vmov r1, s4
+; CHECK-BE-NEXT:    vmov r1, r2, d2
 ; CHECK-BE-NEXT:    cset r0, eq
 ; CHECK-BE-NEXT:    orrs r1, r2
 ; CHECK-BE-NEXT:    cset r1, eq

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index 9fe502a26bbc8..df468608e0eae 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -46,23 +46,21 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-LABEL: sext_v2i1_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    sbcs.w r0, r2, r0
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    vmov r2, r12, d0
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    sbcs.w r1, r2, r1
+; CHECK-NEXT:    rsbs r1, r2, #0
+; CHECK-NEXT:    sbcs.w r1, r3, r12
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #1
-; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
@@ -119,29 +117,29 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-LABEL: zext_v2i1_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adr r1, .LCPI7_0
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    sbcs.w r1, r0, r1
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    sbcs.w r2, r0, r2
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    vmov lr, r12, d0
+; CHECK-NEXT:    adr r2, .LCPI7_0
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    rsbs.w r1, lr, #0
+; CHECK-NEXT:    sbcs.w r1, r3, r12
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI7_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
index d859e20ffd614..da553cc96685c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
@@ -318,11 +318,9 @@ entry:
 define arm_aapcs_vfpcc void @store_v2i1(<2 x i1> *%dst, <2 x i64> %a) {
 ; CHECK-LE-LABEL: store_v2i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov r1, s1
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov r3, s2
+; CHECK-LE-NEXT:    vmov r1, r2, d0
 ; CHECK-LE-NEXT:    orrs r1, r2
-; CHECK-LE-NEXT:    vmov r2, s3
+; CHECK-LE-NEXT:    vmov r2, r3, d1
 ; CHECK-LE-NEXT:    cset r1, eq
 ; CHECK-LE-NEXT:    orrs r2, r3
 ; CHECK-LE-NEXT:    cset r2, eq
@@ -337,11 +335,9 @@ define arm_aapcs_vfpcc void @store_v2i1(<2 x i1> *%dst, <2 x i64> %a) {
 ; CHECK-BE-LABEL: store_v2i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vmov r1, s6
-; CHECK-BE-NEXT:    vmov r2, s7
-; CHECK-BE-NEXT:    vmov r3, s5
+; CHECK-BE-NEXT:    vmov r1, r2, d3
 ; CHECK-BE-NEXT:    orrs r1, r2
-; CHECK-BE-NEXT:    vmov r2, s4
+; CHECK-BE-NEXT:    vmov r2, r3, d2
 ; CHECK-BE-NEXT:    cset r1, eq
 ; CHECK-BE-NEXT:    orrs r2, r3
 ; CHECK-BE-NEXT:    cset r2, eq

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
index 4ddc86632a991..51b5cf528baef 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
@@ -323,11 +323,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -351,11 +349,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
index 37fe00057b061..cbe3172c04950 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
@@ -377,25 +377,21 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, r2, d2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -421,33 +417,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r12, r2, d4
+; CHECK-NEXT:    vmov r3, r1, d2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    eor.w r2, r3, r12
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
index b04be5e25f021..e9ae87165b455 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
@@ -459,27 +459,23 @@ define <8 x i16> @shuffle6_v4i32(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> %a,
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vcmp.i32 eq, q0, zr
 ; CHECK-NEXT:    vpsel q3, q2, q1
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov r0, r1, d6
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q0[1], r1
+; CHECK-NEXT:    vmov r0, r1, d7
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.16 q0[3], r0
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vmov.16 q0[3], r1
 ; CHECK-NEXT:    vcmp.i32 eq, q3, zr
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[7], r0
 ; CHECK-NEXT:    add r0, sp, #32
+; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    add r0, sp, #16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
index 9fba17f421154..715e956fcd686 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
@@ -72,40 +72,34 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, r2, d2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    vbic q3, q3, q2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -202,40 +196,34 @@ define arm_aapcs_vfpcc <2 x i64> @cmpnez_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    cset r1, ne
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, r2, d2
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    cset r1, ne
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    vbic q3, q3, q2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -441,25 +429,21 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1_i1(<2 x i64> %a, <2 x i64> %b, i64
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r2, r3, d3
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r2, r3, d2
 ; CHECK-NEXT:    csetm r12, ne
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, r3, d1
 ; CHECK-NEXT:    csetm r4, ne
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    csetm lr, ne
 ; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    cset r2, eq

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
index 6f336eb33658c..98a0321ccbb88 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
@@ -457,25 +457,21 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, r2, d2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -501,33 +497,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r12, r2, d4
+; CHECK-NEXT:    vmov r3, r1, d2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    eor.w r2, r3, r12
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 180fc78db665d..feb784b16fd21 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -68,20 +68,18 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vbic q3, q0, q2
 ; CHECK-NEXT:    vand q2, q4, q2
 ; CHECK-NEXT:    vorr q2, q2, q3
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    subs r4, r4, r6
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    vmov r3, r4, d4
+; CHECK-NEXT:    subs r3, r3, r6
+; CHECK-NEXT:    sbcs r3, r4, #0
+; CHECK-NEXT:    vmov r4, r5, d5
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs r5, r5, r6
 ; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    subs r4, r4, r6
+; CHECK-NEXT:    sbcs r4, r5, #0
 ; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1
@@ -260,19 +258,17 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov.f32 s22, s15
 ; CHECK-NEXT:    vmullb.s32 q6, q5, q4
 ; CHECK-NEXT:    vmov.f32 s14, s13
-; CHECK-NEXT:    vmov r7, s27
-; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov r4, r7, d13
 ; CHECK-NEXT:    asrl r4, r7, #31
-; CHECK-NEXT:    vmov r10, s24
-; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
 ; CHECK-NEXT:    vmov.f32 s10, s9
-; CHECK-NEXT:    sbcs.w r5, r2, r7
+; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
 ; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    sbcs.w r5, r2, r7
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    vmov r5, s25
+; CHECK-NEXT:    vmov r10, r5, d12
 ; CHECK-NEXT:    csetm r8, ne
 ; CHECK-NEXT:    asrl r10, r5, #31
 ; CHECK-NEXT:    rsbs.w r3, r10, #-2147483648
@@ -290,21 +286,19 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vbic q5, q0, q4
 ; CHECK-NEXT:    vand q4, q6, q4
 ; CHECK-NEXT:    vorr q4, q4, q5
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    vmov r3, r4, d8
+; CHECK-NEXT:    subs.w r3, r3, r8
+; CHECK-NEXT:    sbcs r3, r4, #0
+; CHECK-NEXT:    vmov r4, r5, d9
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs.w r5, r5, r8
 ; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    sbcs r4, r5, #0
 ; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    sbcs r4, r4, #0
 ; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1
@@ -313,10 +307,10 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov q5[2], q5[0], r3, r4
 ; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    smull r6, r5, r6, r5
 ; CHECK-NEXT:    vbic q6, q1, q5
 ; CHECK-NEXT:    vand q4, q4, q5
 ; CHECK-NEXT:    vorr q4, q4, q6
+; CHECK-NEXT:    smull r6, r5, r6, r5
 ; CHECK-NEXT:    asrl r6, r5, #31
 ; CHECK-NEXT:    smull r4, r7, r4, r3
 ; CHECK-NEXT:    asrl r4, r7, #31
@@ -342,20 +336,18 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vbic q3, q0, q2
 ; CHECK-NEXT:    vand q2, q5, q2
 ; CHECK-NEXT:    vorr q2, q2, q3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    subs.w r3, r3, r8
-; CHECK-NEXT:    sbcs r3, r4, #0
-; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov r4, r3, d4
+; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    sbcs r3, r3, #0
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    vmov r3, r4, d5
 ; CHECK-NEXT:    csetm r5, ne
 ; CHECK-NEXT:    vmov.32 q3[1], r5
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    subs.w r3, r3, r8
+; CHECK-NEXT:    sbcs r3, r4, #0
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
@@ -538,18 +530,16 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov.f32 s30, s23
 ; CHECK-NEXT:    vmullb.s32 q0, q7, q6
 ; CHECK-NEXT:    vmov.f32 s18, s17
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov r6, r5, d1
 ; CHECK-NEXT:    asrl r6, r5, #31
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
 ; CHECK-NEXT:    vmov.f32 s22, s21
+; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
 ; CHECK-NEXT:    sbcs.w r7, r12, r5
 ; CHECK-NEXT:    mov.w r7, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r7, #1
 ; CHECK-NEXT:    cmp r7, #0
-; CHECK-NEXT:    vmov r7, s1
+; CHECK-NEXT:    vmov r4, r7, d0
 ; CHECK-NEXT:    csetm r10, ne
 ; CHECK-NEXT:    asrl r4, r7, #31
 ; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
@@ -567,20 +557,18 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vbic q6, q2, q0
 ; CHECK-NEXT:    vand q0, q7, q0
 ; CHECK-NEXT:    vorr q6, q0, q6
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vmov r3, s25
-; CHECK-NEXT:    vmov r5, s26
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    vmov r4, s27
-; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    vmov r3, r4, d12
+; CHECK-NEXT:    subs.w r3, r3, r8
+; CHECK-NEXT:    sbcs r3, r4, #0
+; CHECK-NEXT:    vmov r4, r5, d13
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs.w r5, r5, r8
 ; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    sbcs r4, r5, #0
 ; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1
@@ -618,20 +606,18 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vbic q4, q2, q0
 ; CHECK-NEXT:    vand q0, q5, q0
 ; CHECK-NEXT:    vorr q4, q0, q4
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    vmov r3, r4, d8
+; CHECK-NEXT:    subs.w r3, r3, r8
+; CHECK-NEXT:    sbcs r3, r4, #0
+; CHECK-NEXT:    vmov r4, r5, d9
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs.w r5, r5, r8
 ; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    sbcs r4, r5, #0
 ; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1
@@ -901,14 +887,12 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov.f32 s14, s11
 ; CHECK-NEXT:    vmullb.u32 q4, q3, q1
 ; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov r5, s17
-; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r4, r5, d8
 ; CHECK-NEXT:    lsrl r4, r5, #31
-; CHECK-NEXT:    vmov r7, s19
-; CHECK-NEXT:    subs.w r6, r4, #-1
 ; CHECK-NEXT:    vmov.f32 s10, s9
+; CHECK-NEXT:    subs.w r6, r4, #-1
 ; CHECK-NEXT:    sbcs r5, r5, #0
-; CHECK-NEXT:    vmov r6, s18
+; CHECK-NEXT:    vmov r6, r7, d9
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    lsrl r6, r7, #31
 ; CHECK-NEXT:    it lo
@@ -928,13 +912,11 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vorn q1, q3, q1
 ; CHECK-NEXT:    vmullb.u32 q3, q2, q0
-; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r4, r5, d6
 ; CHECK-NEXT:    lsrl r4, r5, #31
-; CHECK-NEXT:    vmov r7, s15
 ; CHECK-NEXT:    subs.w r6, r4, #-1
 ; CHECK-NEXT:    sbcs r5, r5, #0
-; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r6, r7, d7
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    lsrl r6, r7, #31
 ; CHECK-NEXT:    it lo
@@ -1566,23 +1548,19 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q5
 ; CHECK-NEXT:    vpsel q7, q3, q2
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q6
-; CHECK-NEXT:    vmov r4, s28
+; CHECK-NEXT:    vmov r4, r12, d14
 ; CHECK-NEXT:    vpsel q6, q3, q2
 ; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov r4, s29
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s30
+; CHECK-NEXT:    vmov.16 q5[1], r12
+; CHECK-NEXT:    vmov r4, r12, d15
 ; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s31
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov.16 q5[3], r12
+; CHECK-NEXT:    vmov r4, r12, d12
 ; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s25
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov.16 q5[5], r12
+; CHECK-NEXT:    vmov r4, r12, d13
 ; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov r4, s27
-; CHECK-NEXT:    vmov.16 q5[7], r4
+; CHECK-NEXT:    vmov.16 q5[7], r12
 ; CHECK-NEXT:    vptt.i16 ne, q5, zr
 ; CHECK-NEXT:    vldrht.u16 q5, [r0], #16
 ; CHECK-NEXT:    vldrht.u16 q6, [r1], #16
@@ -1685,23 +1663,19 @@ define arm_aapcs_vfpcc void @ssatmul_8ti_q15(i16* nocapture readonly %pSrcA, i16
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q5
 ; CHECK-NEXT:    vpsel q7, q3, q2
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q6
-; CHECK-NEXT:    vmov r4, s28
+; CHECK-NEXT:    vmov r4, r12, d14
 ; CHECK-NEXT:    vpsel q6, q3, q2
 ; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov r4, s29
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s30
+; CHECK-NEXT:    vmov.16 q5[1], r12
+; CHECK-NEXT:    vmov r4, r12, d15
 ; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s31
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov.16 q5[3], r12
+; CHECK-NEXT:    vmov r4, r12, d12
 ; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s25
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov.16 q5[5], r12
+; CHECK-NEXT:    vmov r4, r12, d13
 ; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov r4, s27
-; CHECK-NEXT:    vmov.16 q5[7], r4
+; CHECK-NEXT:    vmov.16 q5[7], r12
 ; CHECK-NEXT:    vptt.i16 ne, q5, zr
 ; CHECK-NEXT:    vldrht.u16 q5, [r0], #16
 ; CHECK-NEXT:    vldrht.u16 q6, [r1], #16
@@ -2601,23 +2575,19 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q7(i8* nocapture readonly %pSrcA, i8* no
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q5
 ; CHECK-NEXT:    vpsel q7, q3, q2
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q6
-; CHECK-NEXT:    vmov r4, s28
+; CHECK-NEXT:    vmov r4, r12, d14
 ; CHECK-NEXT:    vpsel q6, q3, q2
 ; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov r4, s29
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s30
+; CHECK-NEXT:    vmov.16 q5[1], r12
+; CHECK-NEXT:    vmov r4, r12, d15
 ; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s31
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov.16 q5[3], r12
+; CHECK-NEXT:    vmov r4, r12, d12
 ; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s25
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov.16 q5[5], r12
+; CHECK-NEXT:    vmov r4, r12, d13
 ; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov r4, s27
-; CHECK-NEXT:    vmov.16 q5[7], r4
+; CHECK-NEXT:    vmov.16 q5[7], r12
 ; CHECK-NEXT:    vptt.i16 ne, q5, zr
 ; CHECK-NEXT:    vldrbt.s16 q5, [r0], #8
 ; CHECK-NEXT:    vldrbt.s16 q6, [r1], #8
@@ -2688,12 +2658,12 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: ssatmul_16t_q7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #56
-; CHECK-NEXT:    sub sp, #56
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB18_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
@@ -2704,14 +2674,14 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n
 ; CHECK-NEXT:    sub.w r12, r12, #16
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI18_1
-; CHECK-NEXT:    vmov.i8 q2, #0x0
+; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    add.w lr, lr, r12, lsr #4
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI18_2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
+; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI18_3
@@ -2721,90 +2691,82 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n
 ; CHECK-NEXT:  .LBB18_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vdup.32 q0, r3
-; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    vdup.32 q0, r5
+; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r4, r3, d8
 ; CHECK-NEXT:    vmov.16 q7[0], r4
-; CHECK-NEXT:    vmov r4, s17
-; CHECK-NEXT:    vmov.16 q7[1], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    vmov.16 q7[2], r4
-; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov.16 q7[1], r3
+; CHECK-NEXT:    vmov r3, r4, d9
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q7[2], r3
 ; CHECK-NEXT:    vmov.16 q7[3], r4
 ; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov.16 q7[4], r4
-; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    vmov r3, r4, d8
+; CHECK-NEXT:    vmov.16 q7[4], r3
 ; CHECK-NEXT:    vmov.16 q7[5], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    vmov.16 q7[6], r4
-; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov r3, r4, d9
+; CHECK-NEXT:    vmov.16 q7[6], r3
 ; CHECK-NEXT:    vmov.16 q7[7], r4
 ; CHECK-NEXT:    vcmp.i16 ne, q7, zr
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov.u16 r4, q4[0]
-; CHECK-NEXT:    vmov.8 q7[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[1]
-; CHECK-NEXT:    vmov.8 q7[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[2]
-; CHECK-NEXT:    vmov.8 q7[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[3]
-; CHECK-NEXT:    vmov.8 q7[3], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[4]
-; CHECK-NEXT:    vmov.8 q7[4], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[5]
-; CHECK-NEXT:    vmov.8 q7[5], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[6]
-; CHECK-NEXT:    vmov.8 q7[6], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[7]
+; CHECK-NEXT:    vmov.u16 r3, q4[0]
+; CHECK-NEXT:    vmov.8 q7[0], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[1]
+; CHECK-NEXT:    vmov.8 q7[1], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[2]
+; CHECK-NEXT:    vmov.8 q7[2], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[3]
+; CHECK-NEXT:    vmov.8 q7[3], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[4]
+; CHECK-NEXT:    vmov.8 q7[4], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[5]
+; CHECK-NEXT:    vmov.8 q7[5], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[6]
+; CHECK-NEXT:    vmov.8 q7[6], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[7]
 ; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.8 q7[7], r4
+; CHECK-NEXT:    vmov.8 q7[7], r3
 ; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vorr q0, q0, q6
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
 ; CHECK-NEXT:    vpsel q5, q3, q2
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q0
-; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vmov r3, r4, d10
 ; CHECK-NEXT:    vpsel q0, q3, q2
-; CHECK-NEXT:    vmov.16 q4[0], r4
-; CHECK-NEXT:    vmov r4, s21
+; CHECK-NEXT:    vmov.16 q4[0], r3
 ; CHECK-NEXT:    vmov.16 q4[1], r4
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov.16 q4[2], r4
-; CHECK-NEXT:    vmov r4, s23
+; CHECK-NEXT:    vmov r3, r4, d11
+; CHECK-NEXT:    vmov.16 q4[2], r3
 ; CHECK-NEXT:    vmov.16 q4[3], r4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q4[4], r4
-; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    vmov.16 q4[4], r3
 ; CHECK-NEXT:    vmov.16 q4[5], r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov.16 q4[6], r4
-; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov r3, r4, d1
+; CHECK-NEXT:    vmov.16 q4[6], r3
 ; CHECK-NEXT:    vmov.16 q4[7], r4
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q0, q3, q2
-; CHECK-NEXT:    vmov.u16 r4, q0[0]
-; CHECK-NEXT:    vmov.8 q7[8], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[1]
-; CHECK-NEXT:    vmov.8 q7[9], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[2]
-; CHECK-NEXT:    vmov.8 q7[10], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[3]
-; CHECK-NEXT:    vmov.8 q7[11], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[4]
-; CHECK-NEXT:    vmov.8 q7[12], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[5]
-; CHECK-NEXT:    vmov.8 q7[13], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[6]
-; CHECK-NEXT:    vmov.8 q7[14], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[7]
-; CHECK-NEXT:    vmov.8 q7[15], r4
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    vmov.8 q7[8], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[1]
+; CHECK-NEXT:    vmov.8 q7[9], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    vmov.8 q7[10], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[3]
+; CHECK-NEXT:    vmov.8 q7[11], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    vmov.8 q7[12], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[5]
+; CHECK-NEXT:    vmov.8 q7[13], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    vmov.8 q7[14], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[7]
+; CHECK-NEXT:    vmov.8 q7[15], r3
 ; CHECK-NEXT:    vptt.i8 ne, q7, zr
 ; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
 ; CHECK-NEXT:    vldrbt.u8 q4, [r1], #16
@@ -2816,9 +2778,9 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n
 ; CHECK-NEXT:    vstrbt.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB18_2
 ; CHECK-NEXT:  .LBB18_3: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #56
+; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI18_0:
@@ -2888,12 +2850,12 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: ssatmul_16ti_q7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #56
-; CHECK-NEXT:    sub sp, #56
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB19_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
@@ -2904,14 +2866,14 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8*
 ; CHECK-NEXT:    sub.w r12, r12, #16
 ; CHECK-NEXT:    mov.w lr, #1
 ; CHECK-NEXT:    adr r4, .LCPI19_1
-; CHECK-NEXT:    vmov.i8 q2, #0x0
+; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    add.w lr, lr, r12, lsr #4
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI19_2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
+; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI19_3
@@ -2921,90 +2883,82 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8*
 ; CHECK-NEXT:  .LBB19_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vdup.32 q0, r3
-; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    vdup.32 q0, r5
+; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r4, r3, d8
 ; CHECK-NEXT:    vmov.16 q7[0], r4
-; CHECK-NEXT:    vmov r4, s17
-; CHECK-NEXT:    vmov.16 q7[1], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    vmov.16 q7[2], r4
-; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov.16 q7[1], r3
+; CHECK-NEXT:    vmov r3, r4, d9
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q7[2], r3
 ; CHECK-NEXT:    vmov.16 q7[3], r4
 ; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov.16 q7[4], r4
-; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    vmov r3, r4, d8
+; CHECK-NEXT:    vmov.16 q7[4], r3
 ; CHECK-NEXT:    vmov.16 q7[5], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    vmov.16 q7[6], r4
-; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov r3, r4, d9
+; CHECK-NEXT:    vmov.16 q7[6], r3
 ; CHECK-NEXT:    vmov.16 q7[7], r4
 ; CHECK-NEXT:    vcmp.i16 ne, q7, zr
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov.u16 r4, q4[0]
-; CHECK-NEXT:    vmov.8 q7[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[1]
-; CHECK-NEXT:    vmov.8 q7[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[2]
-; CHECK-NEXT:    vmov.8 q7[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[3]
-; CHECK-NEXT:    vmov.8 q7[3], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[4]
-; CHECK-NEXT:    vmov.8 q7[4], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[5]
-; CHECK-NEXT:    vmov.8 q7[5], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[6]
-; CHECK-NEXT:    vmov.8 q7[6], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[7]
+; CHECK-NEXT:    vmov.u16 r3, q4[0]
+; CHECK-NEXT:    vmov.8 q7[0], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[1]
+; CHECK-NEXT:    vmov.8 q7[1], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[2]
+; CHECK-NEXT:    vmov.8 q7[2], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[3]
+; CHECK-NEXT:    vmov.8 q7[3], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[4]
+; CHECK-NEXT:    vmov.8 q7[4], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[5]
+; CHECK-NEXT:    vmov.8 q7[5], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[6]
+; CHECK-NEXT:    vmov.8 q7[6], r3
+; CHECK-NEXT:    vmov.u16 r3, q4[7]
 ; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.8 q7[7], r4
+; CHECK-NEXT:    vmov.8 q7[7], r3
 ; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vorr q0, q0, q6
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
 ; CHECK-NEXT:    vpsel q5, q3, q2
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q0
-; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vmov r3, r4, d10
 ; CHECK-NEXT:    vpsel q0, q3, q2
-; CHECK-NEXT:    vmov.16 q4[0], r4
-; CHECK-NEXT:    vmov r4, s21
+; CHECK-NEXT:    vmov.16 q4[0], r3
 ; CHECK-NEXT:    vmov.16 q4[1], r4
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov.16 q4[2], r4
-; CHECK-NEXT:    vmov r4, s23
+; CHECK-NEXT:    vmov r3, r4, d11
+; CHECK-NEXT:    vmov.16 q4[2], r3
 ; CHECK-NEXT:    vmov.16 q4[3], r4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q4[4], r4
-; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    vmov.16 q4[4], r3
 ; CHECK-NEXT:    vmov.16 q4[5], r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov.16 q4[6], r4
-; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov r3, r4, d1
+; CHECK-NEXT:    vmov.16 q4[6], r3
 ; CHECK-NEXT:    vmov.16 q4[7], r4
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q0, q3, q2
-; CHECK-NEXT:    vmov.u16 r4, q0[0]
-; CHECK-NEXT:    vmov.8 q7[8], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[1]
-; CHECK-NEXT:    vmov.8 q7[9], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[2]
-; CHECK-NEXT:    vmov.8 q7[10], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[3]
-; CHECK-NEXT:    vmov.8 q7[11], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[4]
-; CHECK-NEXT:    vmov.8 q7[12], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[5]
-; CHECK-NEXT:    vmov.8 q7[13], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[6]
-; CHECK-NEXT:    vmov.8 q7[14], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[7]
-; CHECK-NEXT:    vmov.8 q7[15], r4
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    vmov.8 q7[8], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[1]
+; CHECK-NEXT:    vmov.8 q7[9], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    vmov.8 q7[10], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[3]
+; CHECK-NEXT:    vmov.8 q7[11], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    vmov.8 q7[12], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[5]
+; CHECK-NEXT:    vmov.8 q7[13], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    vmov.8 q7[14], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[7]
+; CHECK-NEXT:    vmov.8 q7[15], r3
 ; CHECK-NEXT:    vptt.i8 ne, q7, zr
 ; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
 ; CHECK-NEXT:    vldrbt.u8 q4, [r1], #16
@@ -3016,9 +2970,9 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8*
 ; CHECK-NEXT:    vstrbt.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB19_2
 ; CHECK-NEXT:  .LBB19_3: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #56
+; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI19_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index 105330c0e54c6..c960979a37033 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -36,48 +36,44 @@ define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    eor.w r12, r1, r0
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    eors r1, r0
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds r2, r2, r0
+; CHECK-NEXT:    eor.w r12, r3, r1
+; CHECK-NEXT:    adc.w r0, r3, r1
+; CHECK-NEXT:    eor.w r1, r3, r0
+; CHECK-NEXT:    vmov r3, r4, d0
 ; CHECK-NEXT:    bic.w r1, r1, r12
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov lr, r1, d2
 ; CHECK-NEXT:    cset r12, mi
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    asrne r2, r0, #31
-; CHECK-NEXT:    adds r4, r4, r5
-; CHECK-NEXT:    mvn r5, #-2147483648
-; CHECK-NEXT:    eor.w lr, r1, r3
-; CHECK-NEXT:    adcs r3, r1
-; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    bic.w r1, r1, lr
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, mi
-; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    eor.w r5, r4, r1
+; CHECK-NEXT:    adcs r1, r4
+; CHECK-NEXT:    eors r4, r1
+; CHECK-NEXT:    bic.w r5, r4, r5
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    cset r5, mi
+; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r4, r3, #31
+; CHECK-NEXT:    asrne r3, r1, #31
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    cset r2, mi
+; CHECK-NEXT:    mvn r3, #-2147483648
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cinv r2, r5, eq
+; CHECK-NEXT:    cinv r2, r3, eq
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    csel r0, r2, r0, ne
-; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    cset r2, mi
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cinv r2, r5, eq
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csel r1, r2, r3, ne
+; CHECK-NEXT:    cinv r2, r3, eq
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csel r1, r2, r1, ne
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -120,33 +116,29 @@ define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r2, d2
 ; CHECK-NEXT:    adcs lr, r12, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    adds r4, r4, r5
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    adcs r3, r12, #0
+; CHECK-NEXT:    movne.w r0, #-1
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    adcs r5, r12, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r4, #-1
+; CHECK-NEXT:    movne.w r3, #-1
 ; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r1, #-1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r2, #-1
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
@@ -189,46 +181,42 @@ define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    subs r2, r2, r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    eor.w r12, r1, r0
-; CHECK-NEXT:    sbc.w r0, r1, r0
-; CHECK-NEXT:    eors r1, r0
-; CHECK-NEXT:    ands.w r1, r1, r12
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    eor.w r12, r3, r1
+; CHECK-NEXT:    sbc.w r1, r3, r1
+; CHECK-NEXT:    eor.w r2, r3, r1
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    ands.w r2, r2, r12
+; CHECK-NEXT:    vmov lr, r2, d2
 ; CHECK-NEXT:    cset r12, mi
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r2, r0, #31
-; CHECK-NEXT:    subs r4, r5, r4
-; CHECK-NEXT:    mvn r5, #-2147483648
-; CHECK-NEXT:    eor.w lr, r1, r3
-; CHECK-NEXT:    sbc.w r3, r1, r3
-; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    ands.w r1, r1, lr
-; CHECK-NEXT:    cset r1, mi
-; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    asrne r0, r1, #31
+; CHECK-NEXT:    subs.w r3, r3, lr
+; CHECK-NEXT:    eor.w r5, r4, r2
+; CHECK-NEXT:    sbc.w r2, r4, r2
+; CHECK-NEXT:    eors r4, r2
+; CHECK-NEXT:    ands r5, r4
+; CHECK-NEXT:    cset r5, mi
+; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r4, r3, #31
+; CHECK-NEXT:    asrne r3, r2, #31
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    cset r0, mi
+; CHECK-NEXT:    mvn r3, #-2147483648
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
-; CHECK-NEXT:    cset r2, mi
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cinv r2, r5, eq
+; CHECK-NEXT:    cinv r0, r3, eq
 ; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r0, r2, r0, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r2, mi
+; CHECK-NEXT:    csel r0, r0, r1, ne
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cinv r2, r5, eq
+; CHECK-NEXT:    cset r1, mi
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csel r1, r2, r3, ne
+; CHECK-NEXT:    cinv r1, r3, eq
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csel r1, r1, r2, ne
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -271,35 +259,31 @@ define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    sbcs.w r0, r1, r0
-; CHECK-NEXT:    adc r1, r12, #0
-; CHECK-NEXT:    rsbs.w lr, r1, #1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r1, r3, r1
+; CHECK-NEXT:    adc r2, r12, #0
+; CHECK-NEXT:    rsbs.w lr, r2, #1
+; CHECK-NEXT:    vmov r3, r2, d2
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r2, #0
-; CHECK-NEXT:    subs r4, r5, r4
-; CHECK-NEXT:    sbcs r1, r3
-; CHECK-NEXT:    adc r3, r12, #0
-; CHECK-NEXT:    rsbs.w r3, r3, #1
+; CHECK-NEXT:    movne r0, #0
+; CHECK-NEXT:    subs r3, r4, r3
+; CHECK-NEXT:    sbcs.w r2, r5, r2
+; CHECK-NEXT:    adc r5, r12, #0
+; CHECK-NEXT:    rsbs.w r5, r5, #1
 ; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r4, #0
+; CHECK-NEXT:    movne r3, #0
 ; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r0, #0
-; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r1, #0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne r2, #0
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index 006413638205e..a6ebd586a3063 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -21,41 +21,36 @@ define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, i32* %ds
 define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, i16* %dst, <8 x i32> %offs) {
 ; CHECK-LABEL: scatter_inc_mini_8i16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vmov.i32 q3, #0x10
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vadd.i32 q4, q1, q3
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vadd.i32 q1, q1, q3
+; CHECK-NEXT:    vmov r1, r2, d2
+; CHECK-NEXT:    vmov r3, r12, d3
 ; CHECK-NEXT:    vshl.i32 q1, q2, #1
-; CHECK-NEXT:    vmov r1, s16
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s17
 ; CHECK-NEXT:    vadd.i32 q1, q1, q3
-; CHECK-NEXT:    strh r2, [r1]
+; CHECK-NEXT:    vmov r0, lr, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strh r6, [r1]
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    strh r1, [r2]
 ; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    strh r1, [r3]
 ; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    strh.w r1, [r12]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strh.w r0, [lr]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strh r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strh r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
   %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   %2 = getelementptr inbounds i16, i16* %dst, <8 x i32> %1
   call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %data, <8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
@@ -65,69 +60,66 @@ define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, i16* %dst, <
 define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, i8* %dst, <16 x i32> %offs) {
 ; CHECK-LABEL: scatter_inc_mini_16i8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov.i32 q5, #0x10
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.i32 q4, #0x10
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vadd.i32 q4, q1, q5
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    vadd.i32 q1, q1, q4
+; CHECK-NEXT:    add r5, sp, #48
+; CHECK-NEXT:    vmov r1, r2, d2
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vadd.i32 q3, q3, q5
-; CHECK-NEXT:    vadd.i32 q2, q2, q5
-; CHECK-NEXT:    strb r2, [r1]
-; CHECK-NEXT:    add r1, sp, #32
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov r3, r12, d3
+; CHECK-NEXT:    vadd.i32 q1, q2, r0
+; CHECK-NEXT:    vadd.i32 q2, q1, q4
+; CHECK-NEXT:    vldrw.u32 q1, [r5]
+; CHECK-NEXT:    vmov lr, r7, d4
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vadd.i32 q1, q1, q5
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov r0, r8, d5
+; CHECK-NEXT:    vadd.i32 q2, q3, q4
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    vadd.i32 q1, q1, q4
+; CHECK-NEXT:    vmov.u8 r5, q0[6]
+; CHECK-NEXT:    strb r6, [r1]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    strb r1, [r2]
+; CHECK-NEXT:    vmov.u8 r6, q0[2]
+; CHECK-NEXT:    vmov r1, r9, d4
+; CHECK-NEXT:    strb r6, [r3]
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    strb.w r3, [r12]
+; CHECK-NEXT:    vmov r3, r6, d5
+; CHECK-NEXT:    strb.w r4, [lr]
+; CHECK-NEXT:    vmov.u8 r4, q0[5]
+; CHECK-NEXT:    strb r4, [r7]
+; CHECK-NEXT:    vmov r7, r4, d2
+; CHECK-NEXT:    strb r5, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    strb.w r0, [r8]
+; CHECK-NEXT:    vmov r0, r5, d3
+; CHECK-NEXT:    strb r2, [r1]
 ; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    strb.w r1, [r9]
 ; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    strb r1, [r3]
 ; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    strb r1, [r6]
 ; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    strb r1, [r7]
 ; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    strb r1, [r4]
 ; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    strb r0, [r5]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
   %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %2 = getelementptr inbounds i8, i8* %dst, <16 x i32> %1
   call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %data, <16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
index 9c8405825f656..2638bf7760da0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@@ -51,37 +51,35 @@ entry:
 define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: scaled_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q2, [r1]
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov r12, lr, d3
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vshl.i32 q2, q2, #1
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strh r6, [r2]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    strh r2, [r3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    strh.w r2, [r12]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    strh.w r2, [lr]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    strh r2, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strh r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strh r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.sext = sext <8 x i16> %offs to <8 x i32>
@@ -99,27 +97,23 @@ define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <
 ; CHECK-NEXT:    vshl.i32 q2, q1, #1
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    vstr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vstr.16 s12, [r2]
+; CHECK-NEXT:    vmov r1, r2, d5
+; CHECK-NEXT:    vmovx.f16 s8, s1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vstr.16 s12, [r1]
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vstr.16 s1, [r1]
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vstr.16 s8, [r1]
-; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vstr.16 s8, [r2]
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vstr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vstr.16 s2, [r0]
+; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vstr.16 s3, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
@@ -180,40 +174,38 @@ entry:
 define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: scaled_v8i16_i16_2gep:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vldrh.s32 q3, [r1]
-; CHECK-NEXT:    vmov.i32 q2, #0x28
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vshl.i32 q3, q3, #1
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrh.s32 q2, [r1]
+; CHECK-NEXT:    vmov.i32 q1, #0x28
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vshl.i32 q2, q2, #1
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vadd.i32 q2, q2, q1
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r12, lr, d5
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
+; CHECK-NEXT:    vshl.i32 q2, q2, #1
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vadd.i32 q1, q2, q1
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strh r6, [r2]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    strh r2, [r3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    strh.w r2, [r12]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    strh.w r2, [lr]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    strh r2, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strh r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strh r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
index 85cc5eff3e799..41508444863bc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
@@ -69,35 +69,33 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q2, [r1]
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov r12, lr, d3
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strh r6, [r2]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    strh r2, [r3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    strh.w r2, [r12]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    strh.w r2, [lr]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    strh r2, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strh r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strh r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.sext = sext <8 x i16> %offs to <8 x i32>
@@ -116,25 +114,21 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr,
 ; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    vstr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vstr.16 s12, [r1]
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vstr.16 s1, [r1]
-; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vstr.16 s12, [r2]
+; CHECK-NEXT:    vmov r1, r2, d5
 ; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vstr.16 s8, [r1]
-; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vstr.16 s1, [r1]
+; CHECK-NEXT:    vstr.16 s8, [r2]
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vstr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vstr.16 s3, [r0]
-; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vstr.16 s2, [r0]
+; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s3, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
@@ -149,35 +143,33 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_noext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov r12, lr, d3
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strh r6, [r2]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    strh r2, [r3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    strh.w r2, [r12]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    strh.w r2, [lr]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    strh r2, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strh r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strh r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
   %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
@@ -195,25 +187,21 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr,
 ; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    vstr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vstr.16 s12, [r1]
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vstr.16 s1, [r1]
-; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vstr.16 s12, [r2]
+; CHECK-NEXT:    vmov r1, r2, d5
 ; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vstr.16 s8, [r1]
-; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vstr.16 s1, [r1]
+; CHECK-NEXT:    vstr.16 s8, [r2]
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vstr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vstr.16 s3, [r0]
-; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vstr.16 s2, [r0]
+; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s3, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
@@ -259,38 +247,36 @@ entry:
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrb.s32 q5, [r1]
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrb.s32 q4, [r1]
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vadd.i32 q4, q4, r0
+; CHECK-NEXT:    vmov r2, r3, d8
+; CHECK-NEXT:    vmov r12, lr, d9
 ; CHECK-NEXT:    vldrb.s32 q4, [r1, #4]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vadd.i32 q5, q5, r0
 ; CHECK-NEXT:    vadd.i32 q4, q4, r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r1, d8
+; CHECK-NEXT:    strh r4, [r2]
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r4, r5, d9
+; CHECK-NEXT:    strh r2, [r3]
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    strh.w r2, [r12]
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    strh.w r2, [lr]
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    strh r2, [r0]
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    strh r0, [r4]
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    strh r0, [r5]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.sext = sext <8 x i8> %offs to <8 x i32>
@@ -341,35 +327,29 @@ entry:
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q3, [r1]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrb.s32 q2, [r1]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r12, lr, d5
 ; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vadd.i32 q3, q3, r0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r6, d1
+; CHECK-NEXT:    strh r4, [r2]
+; CHECK-NEXT:    vmov r2, r7, d4
+; CHECK-NEXT:    strh r5, [r3]
+; CHECK-NEXT:    vmov r3, r5, d5
+; CHECK-NEXT:    strh.w r0, [r12]
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    strh.w r6, [lr]
+; CHECK-NEXT:    vmov r6, r4, d3
+; CHECK-NEXT:    strh r0, [r2]
+; CHECK-NEXT:    strh r1, [r7]
+; CHECK-NEXT:    strh r6, [r3]
+; CHECK-NEXT:    strh r4, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.sext = sext <8 x i8> %offs to <8 x i32>
@@ -384,25 +364,23 @@ entry:
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov.16 q2[1], r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.16 q2[2], r3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.16 q2[3], r3
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov.16 q2[0], r4
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r3, r2, d2
+; CHECK-NEXT:    vldrb.u16 q1, [r1]
+; CHECK-NEXT:    vmov r1, r4, d1
+; CHECK-NEXT:    vmov.16 q2[1], r5
+; CHECK-NEXT:    vmov.16 q2[2], r1
+; CHECK-NEXT:    vmov.16 q2[3], r4
 ; CHECK-NEXT:    vmov.16 q2[4], r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov.16 q2[5], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov.16 q2[6], r3
-; CHECK-NEXT:    vldrb.u16 q0, [r1]
-; CHECK-NEXT:    vmov.16 q2[7], r2
-; CHECK-NEXT:    vstrh.16 q2, [r0, q0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.16 q2[5], r2
+; CHECK-NEXT:    vmov.16 q2[6], lr
+; CHECK-NEXT:    vmov.16 q2[7], r12
+; CHECK-NEXT:    vstrh.16 q2, [r0, q1]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
@@ -417,35 +395,33 @@ entry:
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q2, [r1]
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov r12, lr, d3
 ; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strb r6, [r2]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    strb r2, [r3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    strb.w r2, [r12]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    strb.w r2, [lr]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    strb r2, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strb r0, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strb r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strb r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.sext = sext <8 x i8> %offs to <8 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
index 3caa4b2cfabf2..f7f22abf4e3f8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
@@ -236,24 +236,22 @@ entry:
 define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
 ; CHECK-LABEL: ext_scaled_i16_i32_2gep:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q2, [r1]
 ; CHECK-NEXT:    vmov.i32 q1, #0xa
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    vshl.i32 q2, q2, #1
+; CHECK-NEXT:    vmov r4, r5, d1
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, r12, d2
+; CHECK-NEXT:    vmov r2, lr, d3
 ; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    strh.w r3, [r12]
+; CHECK-NEXT:    strh r4, [r2]
+; CHECK-NEXT:    strh.w r5, [lr]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
   %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
index f4833b6a42212..37a1e934baef0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
@@ -361,21 +361,19 @@ entry:
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrb.s32 q1, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r4, r5, d1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, r12, d2
+; CHECK-NEXT:    vmov r2, lr, d3
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    strb.w r3, [r12]
+; CHECK-NEXT:    strb r4, [r2]
+; CHECK-NEXT:    strb.w r5, [lr]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
   %offs.sext = sext <4 x i8> %offs to <4 x i32>
@@ -388,21 +386,19 @@ entry:
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrb.u32 q1, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r4, r5, d1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, r12, d2
+; CHECK-NEXT:    vmov r2, lr, d3
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    strb.w r3, [r12]
+; CHECK-NEXT:    strb r4, [r2]
+; CHECK-NEXT:    strb.w r5, [lr]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
   %offs.zext = zext <4 x i8> %offs to <4 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
index 4840e4acf01bb..8fdb1a60eba51 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -20,35 +20,33 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
 ; CHECK-LABEL: unscaled_v8i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q2, [r1]
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov r12, lr, d3
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strb r6, [r2]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    strb r2, [r3]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    strb.w r2, [r12]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    strb.w r2, [lr]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    strb r2, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strb r0, [r1]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strb r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strb r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
@@ -85,66 +83,57 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrb.s32 q4, [r1]
-; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
-; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
-; CHECK-NEXT:    vldrb.s32 q3, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q4, q4, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrb.s32 q3, [r1, #8]
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov.u8 r7, q0[6]
+; CHECK-NEXT:    vmov r12, lr, d3
+; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
+; CHECK-NEXT:    vmov r4, r8, d4
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r0, r9, d5
+; CHECK-NEXT:    strb r6, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    strb r2, [r3]
+; CHECK-NEXT:    vmov.u8 r6, q0[2]
+; CHECK-NEXT:    vmov r2, r10, d6
+; CHECK-NEXT:    strb.w r6, [r12]
+; CHECK-NEXT:    vmov.u8 r6, q0[3]
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    strb.w r6, [lr]
+; CHECK-NEXT:    vmov r6, r1, d7
+; CHECK-NEXT:    strb r5, [r4]
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    strb.w r5, [r8]
+; CHECK-NEXT:    vmov r5, r4, d2
+; CHECK-NEXT:    strb r7, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    strb.w r0, [r9]
+; CHECK-NEXT:    vmov r0, r7, d3
+; CHECK-NEXT:    strb r3, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    strb.w r2, [r10]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    strb r2, [r6]
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    strb r2, [r1]
 ; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    strb r1, [r5]
 ; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    strb r1, [r4]
 ; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    strb r0, [r7]
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
   %offs.sext = sext <16 x i8> %offs to <16 x i32>
@@ -157,66 +146,57 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrh.s32 q4, [r1]
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #8]
-; CHECK-NEXT:    vadd.i32 q4, q4, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov.u8 r7, q0[6]
+; CHECK-NEXT:    vmov r12, lr, d3
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
+; CHECK-NEXT:    vmov r4, r8, d4
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r0, r9, d5
+; CHECK-NEXT:    strb r6, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    strb r2, [r3]
+; CHECK-NEXT:    vmov.u8 r6, q0[2]
+; CHECK-NEXT:    vmov r2, r10, d6
+; CHECK-NEXT:    strb.w r6, [r12]
+; CHECK-NEXT:    vmov.u8 r6, q0[3]
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    strb.w r6, [lr]
+; CHECK-NEXT:    vmov r6, r1, d7
+; CHECK-NEXT:    strb r5, [r4]
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    strb.w r5, [r8]
+; CHECK-NEXT:    vmov r5, r4, d2
+; CHECK-NEXT:    strb r7, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    strb.w r0, [r9]
+; CHECK-NEXT:    vmov r0, r7, d3
+; CHECK-NEXT:    strb r3, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    strb.w r2, [r10]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    strb r2, [r6]
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    strb r2, [r1]
 ; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    strb r1, [r5]
 ; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    strb r1, [r4]
 ; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    strb r0, [r7]
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
   %offs.sext = sext <16 x i16> %offs to <16 x i32>
@@ -229,70 +209,61 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_scaled:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrb.u32 q4, [r1]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
-; CHECK-NEXT:    vldrb.u32 q2, [r1, #8]
-; CHECK-NEXT:    vldrb.u32 q3, [r1, #4]
-; CHECK-NEXT:    vshl.i32 q4, q4, #2
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.u32 q3, [r1, #8]
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
+; CHECK-NEXT:    vmov.u8 r7, q0[4]
 ; CHECK-NEXT:    vshl.i32 q1, q1, #2
-; CHECK-NEXT:    vshl.i32 q2, q2, #2
 ; CHECK-NEXT:    vshl.i32 q3, q3, #2
-; CHECK-NEXT:    vadd.i32 q4, q4, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov r12, lr, d3
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT:    vshl.i32 q1, q1, #2
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
+; CHECK-NEXT:    vmov r4, r8, d4
 ; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vshl.i32 q1, q1, #2
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r0, r9, d5
+; CHECK-NEXT:    strb r6, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    strb r2, [r3]
+; CHECK-NEXT:    vmov.u8 r6, q0[2]
+; CHECK-NEXT:    vmov r2, r10, d6
+; CHECK-NEXT:    strb.w r6, [r12]
+; CHECK-NEXT:    vmov.u8 r6, q0[3]
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    strb.w r6, [lr]
+; CHECK-NEXT:    vmov r6, r5, d7
+; CHECK-NEXT:    strb r7, [r4]
+; CHECK-NEXT:    vmov.u8 r7, q0[5]
+; CHECK-NEXT:    strb.w r7, [r8]
+; CHECK-NEXT:    vmov r7, r4, d2
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    strb.w r0, [r9]
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    strb r3, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    strb.w r2, [r10]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    strb r2, [r6]
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    strb r2, [r5]
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    strb r2, [r7]
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    strb r2, [r4]
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    strb r2, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    strb r0, [r1]
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
   %offs.zext = zext <16 x i8> %offs to <16 x i32>
@@ -306,66 +277,57 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i8_next:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q4, [r1]
-; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r1, #16]
-; CHECK-NEXT:    vadd.i32 q4, q4, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrw.u32 q3, [r1, #32]
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    vmov.u8 r7, q0[6]
+; CHECK-NEXT:    vmov r12, lr, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
+; CHECK-NEXT:    vmov r4, r8, d4
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r0, r9, d5
+; CHECK-NEXT:    strb r6, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    strb r2, [r3]
+; CHECK-NEXT:    vmov.u8 r6, q0[2]
+; CHECK-NEXT:    vmov r2, r10, d6
+; CHECK-NEXT:    strb.w r6, [r12]
+; CHECK-NEXT:    vmov.u8 r6, q0[3]
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    strb.w r6, [lr]
+; CHECK-NEXT:    vmov r6, r1, d7
+; CHECK-NEXT:    strb r5, [r4]
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    strb.w r5, [r8]
+; CHECK-NEXT:    vmov r5, r4, d2
+; CHECK-NEXT:    strb r7, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    strb.w r0, [r9]
+; CHECK-NEXT:    vmov r0, r7, d3
+; CHECK-NEXT:    strb r3, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    strb.w r2, [r10]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    strb r2, [r6]
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    strb r2, [r1]
 ; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    strb r1, [r5]
 ; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    strb r1, [r4]
 ; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    strb r0, [r7]
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
   %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
@@ -438,44 +400,38 @@ entry:
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.8 q4[0], r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov.8 q4[1], r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.8 q4[2], r3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.8 q4[3], r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.8 q4[4], r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov.8 q4[5], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov.8 q4[6], r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vmov.8 q4[7], r3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.8 q4[8], r3
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov.8 q4[9], r3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov.8 q4[10], r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    vmov.8 q4[11], r3
-; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov.8 q4[0], r4
+; CHECK-NEXT:    vmov lr, r12, d7
+; CHECK-NEXT:    vmov r3, r2, d6
+; CHECK-NEXT:    vldrb.u8 q3, [r1]
+; CHECK-NEXT:    vmov r1, r4, d1
+; CHECK-NEXT:    vmov.8 q4[1], r5
+; CHECK-NEXT:    vmov.8 q4[2], r1
+; CHECK-NEXT:    vmov r1, r5, d2
+; CHECK-NEXT:    vmov.8 q4[3], r4
+; CHECK-NEXT:    vmov.8 q4[4], r1
+; CHECK-NEXT:    vmov r1, r4, d3
+; CHECK-NEXT:    vmov.8 q4[5], r5
+; CHECK-NEXT:    vmov.8 q4[6], r1
+; CHECK-NEXT:    vmov r1, r5, d4
+; CHECK-NEXT:    vmov.8 q4[7], r4
+; CHECK-NEXT:    vmov.8 q4[8], r1
+; CHECK-NEXT:    vmov r1, r4, d5
+; CHECK-NEXT:    vmov.8 q4[9], r5
+; CHECK-NEXT:    vmov.8 q4[10], r1
+; CHECK-NEXT:    vmov.8 q4[11], r4
 ; CHECK-NEXT:    vmov.8 q4[12], r3
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    vmov.8 q4[13], r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov.8 q4[14], r3
-; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    vmov.8 q4[15], r2
-; CHECK-NEXT:    vstrb.8 q4, [r0, q0]
+; CHECK-NEXT:    vmov.8 q4[13], r2
+; CHECK-NEXT:    vmov.8 q4[14], lr
+; CHECK-NEXT:    vmov.8 q4[15], r12
+; CHECK-NEXT:    vstrb.8 q4, [r0, q3]
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
   %offs.zext = zext <16 x i8> %offs to <16 x i32>
@@ -536,71 +492,65 @@ entry:
 define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i8_2gep:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
-; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
-; CHECK-NEXT:    vldrb.s32 q3, [r1, #4]
-; CHECK-NEXT:    vldrb.s32 q5, [r1]
-; CHECK-NEXT:    vmov.i32 q4, #0x5
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrb.s32 q2, [r1]
+; CHECK-NEXT:    vmov.i32 q1, #0x5
+; CHECK-NEXT:    vldrb.s32 q4, [r1, #8]
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    vadd.i32 q5, q5, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, q4
-; CHECK-NEXT:    vadd.i32 q2, q2, q4
-; CHECK-NEXT:    vadd.i32 q3, q3, q4
-; CHECK-NEXT:    vadd.i32 q4, q5, q4
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
+; CHECK-NEXT:    vadd.i32 q2, q2, q1
+; CHECK-NEXT:    vadd.i32 q4, q4, r0
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov.u8 r7, q0[6]
+; CHECK-NEXT:    vmov r12, lr, d5
+; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vadd.i32 q3, q2, q1
+; CHECK-NEXT:    vldrb.s32 q2, [r1, #12]
+; CHECK-NEXT:    vmov r4, r8, d6
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vmov r0, r9, d7
+; CHECK-NEXT:    vadd.i32 q3, q4, q1
+; CHECK-NEXT:    vadd.i32 q1, q2, q1
+; CHECK-NEXT:    strb r6, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    strb r2, [r3]
+; CHECK-NEXT:    vmov.u8 r6, q0[2]
+; CHECK-NEXT:    vmov r2, r10, d6
+; CHECK-NEXT:    strb.w r6, [r12]
+; CHECK-NEXT:    vmov.u8 r6, q0[3]
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    strb.w r6, [lr]
+; CHECK-NEXT:    vmov r6, r1, d7
+; CHECK-NEXT:    strb r5, [r4]
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    strb.w r5, [r8]
+; CHECK-NEXT:    vmov r5, r4, d2
+; CHECK-NEXT:    strb r7, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    strb.w r0, [r9]
+; CHECK-NEXT:    vmov r0, r7, d3
+; CHECK-NEXT:    strb r3, [r2]
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    strb.w r2, [r10]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    strb r2, [r6]
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    strb r2, [r1]
 ; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    strb r1, [r5]
 ; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    strb r1, [r4]
 ; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    strb r0, [r7]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
   %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
index b3529089ef24e..7847c4d3ac806 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -36,33 +36,27 @@ entry:
 define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, <8 x i32*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    vmov r1, r2, d4
+; CHECK-NEXT:    vmov lr, r12, d5
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r5, d1
+; CHECK-NEXT:    str r3, [r1]
+; CHECK-NEXT:    vmov r1, r7, d4
+; CHECK-NEXT:    str r4, [r2]
+; CHECK-NEXT:    vmov r2, r4, d5
+; CHECK-NEXT:    str.w r0, [lr]
+; CHECK-NEXT:    vmov r0, r3, d2
+; CHECK-NEXT:    str.w r5, [r12]
+; CHECK-NEXT:    vmov r5, r6, d3
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    str r3, [r7]
+; CHECK-NEXT:    str r5, [r2]
+; CHECK-NEXT:    str r6, [r4]
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
   call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %v, <8 x i32*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
@@ -73,62 +67,51 @@ entry:
 define arm_aapcs_vfpcc void @ptr_v16i32(<16 x i32> %v, <16 x i32*>* %offptr) {
 ; CHECK-LABEL: ptr_v16i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q7, [r0]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vmov r3, r4, d0
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s29
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s31
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s27
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov r1, s13
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r1, r2, d8
+; CHECK-NEXT:    vmov lr, r12, d9
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vmov r0, r5, d1
+; CHECK-NEXT:    str r3, [r1]
+; CHECK-NEXT:    vmov r1, r3, d12
+; CHECK-NEXT:    str r4, [r2]
+; CHECK-NEXT:    vmov r2, r7, d13
+; CHECK-NEXT:    str.w r0, [lr]
+; CHECK-NEXT:    vmov r0, r4, d2
+; CHECK-NEXT:    str.w r5, [r12]
+; CHECK-NEXT:    vmov r5, r6, d3
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    str r4, [r3]
+; CHECK-NEXT:    vmov r3, r4, d11
+; CHECK-NEXT:    str r5, [r2]
+; CHECK-NEXT:    vmov r2, r5, d4
+; CHECK-NEXT:    str r6, [r7]
+; CHECK-NEXT:    vmov r7, r6, d5
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    vmov r0, r2, d8
+; CHECK-NEXT:    str r5, [r1]
+; CHECK-NEXT:    vmov r1, r5, d9
+; CHECK-NEXT:    str r7, [r3]
+; CHECK-NEXT:    vmov r3, r7, d6
+; CHECK-NEXT:    str r6, [r4]
+; CHECK-NEXT:    vmov r6, r4, d7
+; CHECK-NEXT:    str r3, [r0]
+; CHECK-NEXT:    str r7, [r2]
+; CHECK-NEXT:    str r6, [r1]
+; CHECK-NEXT:    str r4, [r5]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
   call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %v, <16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
@@ -170,24 +153,20 @@ define arm_aapcs_vfpcc void @ptr_v8f32(<8 x float> %v, <8 x float*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov lr, s10
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    vstr s0, [r5]
-; CHECK-NEXT:    vstr s1, [r4]
-; CHECK-NEXT:    vstr s2, [r2]
-; CHECK-NEXT:    vstr s3, [r0]
-; CHECK-NEXT:    vstr s4, [r1]
-; CHECK-NEXT:    vstr s5, [r3]
-; CHECK-NEXT:    vstr s6, [lr]
-; CHECK-NEXT:    vstr s7, [r12]
+; CHECK-NEXT:    vmov r1, lr, d4
+; CHECK-NEXT:    vmov r3, r12, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov r0, r2, d4
+; CHECK-NEXT:    vmov r4, r5, d5
+; CHECK-NEXT:    vstr s0, [r1]
+; CHECK-NEXT:    vstr s1, [lr]
+; CHECK-NEXT:    vstr s2, [r3]
+; CHECK-NEXT:    vstr s3, [r12]
+; CHECK-NEXT:    vstr s4, [r0]
+; CHECK-NEXT:    vstr s5, [r2]
+; CHECK-NEXT:    vstr s6, [r4]
+; CHECK-NEXT:    vstr s7, [r5]
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x float*>, <8 x float*>* %offptr, align 4
@@ -201,33 +180,31 @@ entry:
 define arm_aapcs_vfpcc void @ptr_i16(<8 x i16> %v, <8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vmov r1, r2, d2
+; CHECK-NEXT:    vmov r3, r12, d3
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov r0, lr, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strh r6, [r1]
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    strh r1, [r2]
 ; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    strh r1, [r3]
 ; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    strh.w r1, [r12]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strh.w r0, [lr]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strh r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strh r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
   call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v, <8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
@@ -255,20 +232,18 @@ entry:
 define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, <4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_trunc:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    vmov r0, r12, d2
+; CHECK-NEXT:    vmov r2, lr, d3
 ; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    strh.w r3, [r12]
+; CHECK-NEXT:    strh r4, [r2]
+; CHECK-NEXT:    strh.w r5, [lr]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
   %ext = trunc <4 x i32> %v to <4 x i16>
@@ -280,33 +255,27 @@ entry:
 define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_trunc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    vmov r1, r2, d4
+; CHECK-NEXT:    vmov lr, r12, d5
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r5, d1
+; CHECK-NEXT:    strh r3, [r1]
+; CHECK-NEXT:    vmov r1, r7, d4
+; CHECK-NEXT:    strh r4, [r2]
+; CHECK-NEXT:    vmov r2, r4, d5
+; CHECK-NEXT:    strh.w r0, [lr]
+; CHECK-NEXT:    vmov r0, r3, d2
+; CHECK-NEXT:    strh.w r5, [r12]
+; CHECK-NEXT:    vmov r5, r6, d3
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    strh r3, [r7]
+; CHECK-NEXT:    strh r5, [r2]
+; CHECK-NEXT:    strh r6, [r4]
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
   %ext = trunc <8 x i32> %v to <8 x i16>
@@ -323,25 +292,21 @@ define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, <8 x half*>* %offptr) {
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vstr.16 s12, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vstr.16 s1, [r0]
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vstr.16 s12, [r1]
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vstr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vstr.16 s1, [r0]
+; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vstr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vstr.16 s2, [r0]
+; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vstr.16 s3, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x half*>, <8 x half*>* %offptr, align 4
@@ -355,62 +320,53 @@ entry:
 define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, <16 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
+; CHECK-NEXT:    vmov r1, r2, d2
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
+; CHECK-NEXT:    vmov r3, r12, d3
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov lr, r4, d4
+; CHECK-NEXT:    vmov.u8 r7, q0[6]
+; CHECK-NEXT:    vmov r0, r8, d5
+; CHECK-NEXT:    strb r6, [r1]
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    strb r1, [r2]
+; CHECK-NEXT:    vmov.u8 r6, q0[2]
+; CHECK-NEXT:    vmov r1, r9, d6
+; CHECK-NEXT:    strb r6, [r3]
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    strb.w r3, [r12]
+; CHECK-NEXT:    vmov r3, r6, d7
+; CHECK-NEXT:    strb.w r5, [lr]
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    strb r5, [r4]
+; CHECK-NEXT:    vmov r5, r4, d2
+; CHECK-NEXT:    strb r7, [r0]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    strb.w r0, [r8]
+; CHECK-NEXT:    vmov r0, r7, d3
+; CHECK-NEXT:    strb r2, [r1]
 ; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    strb.w r1, [r9]
 ; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    strb r1, [r3]
 ; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    strb r1, [r6]
 ; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    strb r1, [r5]
 ; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    strb r1, [r4]
 ; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    strb r0, [r7]
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4
   call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v, <16 x i8*> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
@@ -421,33 +377,31 @@ entry:
 define arm_aapcs_vfpcc void @ptr_v8i8_trunc16(<8 x i16> %v, <8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_trunc16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vmov r1, r2, d2
+; CHECK-NEXT:    vmov r3, r12, d3
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov r0, lr, d2
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    strb r6, [r1]
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    strb r1, [r2]
 ; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    strb r1, [r3]
 ; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    strb.w r1, [r12]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    strb.w r0, [lr]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    strb r0, [r4]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    strb r0, [r5]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
   %ext = trunc <8 x i16> %v to <8 x i8>
@@ -459,20 +413,18 @@ entry:
 define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, <4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_trunc32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    vmov r0, r12, d2
+; CHECK-NEXT:    vmov r2, lr, d3
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    strb.w r3, [r12]
+; CHECK-NEXT:    strb r4, [r2]
+; CHECK-NEXT:    strb.w r5, [lr]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
   %ext = trunc <4 x i32> %v to <4 x i8>
@@ -484,33 +436,27 @@ entry:
 define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, <8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_trunc32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    vmov r1, r2, d4
+; CHECK-NEXT:    vmov lr, r12, d5
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r0, r5, d1
+; CHECK-NEXT:    strb r3, [r1]
+; CHECK-NEXT:    vmov r1, r7, d4
+; CHECK-NEXT:    strb r4, [r2]
+; CHECK-NEXT:    vmov r2, r4, d5
+; CHECK-NEXT:    strb.w r0, [lr]
+; CHECK-NEXT:    vmov r0, r3, d2
+; CHECK-NEXT:    strb.w r5, [r12]
+; CHECK-NEXT:    vmov r5, r6, d3
+; CHECK-NEXT:    strb r0, [r1]
+; CHECK-NEXT:    strb r3, [r7]
+; CHECK-NEXT:    strb r5, [r2]
+; CHECK-NEXT:    strb r6, [r4]
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
   %ext = trunc <8 x i32> %v to <8 x i8>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll
index 4bec492b5ac95..59e66f7abe567 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll
@@ -78,15 +78,12 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i35(<2 x i64> %m) {
 ; CHECK-LABEL: sext_v2i64_v2i64_v2i35:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    sbfx r0, r0, #0, #3
-; CHECK-NEXT:    sbfx r1, r1, #0, #3
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    sbfx r0, r1, #0, #3
+; CHECK-NEXT:    sbfx r1, r3, #0, #3
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %shl = shl <2 x i64> %m, <i64 29, i64 29>
@@ -480,22 +477,18 @@ define arm_aapcs_vfpcc <8 x i16> @trunc_v8i32_v8i16(<8 x i32> %src) {
 ; CHECK-LABEL: trunc_v8i32_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.16 q0[1], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = trunc <8 x i32> %src to <8 x i16>
@@ -508,38 +501,30 @@ define arm_aapcs_vfpcc <16 x i8> @trunc_v16i32_v16i8(<16 x i32> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.8 q0[1], r1
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.8 q0[3], r1
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.8 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.8 q0[7], r1
+; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.8 q0[9], r1
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.8 q0[11], r1
+; CHECK-NEXT:    vmov r0, r1, d6
 ; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.8 q0[13], r1
+; CHECK-NEXT:    vmov r0, r1, d7
 ; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.8 q0[15], r1
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-shifts.ll
index 6b095a39625fb..19d51938a54ed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shifts.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shifts.ll
@@ -34,17 +34,17 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shl_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: shl_qq_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r5, lr}
+; CHECK-NEXT:    push {r5, lr}
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, r1, d1
 ; CHECK-NEXT:    lsll r2, r1, r0
-; CHECK-NEXT:    vmov r12, s4
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    lsll r0, r3, r12
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, r5, d0
+; CHECK-NEXT:    lsll r0, r5, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r1
+; CHECK-NEXT:    pop {r5, pc}
 entry:
   %0 = shl <2 x i64> %src1, %src2
   ret <2 x i64> %0
@@ -91,12 +91,10 @@ define arm_aapcs_vfpcc <2 x i64> @shru_qq_int64_t(<2 x i64> %src1, <2 x i64> %sr
 ; CHECK-NEXT:    push {r5, lr}
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r5, d1
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    lsll r0, r5, r2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    lsll r2, r3, r1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
@@ -144,17 +142,17 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shrs_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: shrs_qq_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r5, lr}
+; CHECK-NEXT:    push {r5, lr}
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, r1, d1
 ; CHECK-NEXT:    asrl r2, r1, r0
-; CHECK-NEXT:    vmov r12, s4
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    asrl r0, r3, r12
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, r5, d0
+; CHECK-NEXT:    asrl r0, r5, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r1
+; CHECK-NEXT:    pop {r5, pc}
 entry:
   %0 = ashr <2 x i64> %src1, %src2
   ret <2 x i64> %0
@@ -194,11 +192,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shl_qi_int64_t(<2 x i64> %src1) {
 ; CHECK-LABEL: shl_qi_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    lsll r0, r1, #4
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    lsll r2, r3, #4
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
@@ -242,11 +238,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shru_qi_int64_t(<2 x i64> %src1) {
 ; CHECK-LABEL: shru_qi_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    lsrl r0, r1, #4
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    lsrl r2, r3, #4
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
@@ -290,11 +284,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shrs_qi_int64_t(<2 x i64> %src1) {
 ; CHECK-LABEL: shrs_qi_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    asrl r0, r1, #4
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    asrl r2, r3, #4
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
@@ -344,11 +336,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shl_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shl_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r12, s2
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r12, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    lsll r12, r1, r0
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    lsll r2, r3, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
@@ -403,16 +393,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shru_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shru_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    rsb.w r12, r0, #0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    lsll r2, r1, r12
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    lsll r0, r3, r12
+; CHECK-NEXT:    .save {r5, lr}
+; CHECK-NEXT:    push {r5, lr}
+; CHECK-NEXT:    rsbs r3, r0, #0
+; CHECK-NEXT:    vmov r2, r1, d1
+; CHECK-NEXT:    vmov r0, r5, d0
+; CHECK-NEXT:    lsll r2, r1, r3
+; CHECK-NEXT:    lsll r0, r5, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r1
+; CHECK-NEXT:    pop {r5, pc}
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
   %s = shufflevector <2 x i64> %i, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -463,11 +453,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shrs_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shrs_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r12, s2
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r12, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    asrl r12, r1, r0
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    asrl r2, r3, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 415ce651b5caa..46c3cebb76860 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -1678,8 +1678,7 @@ entry:
 define arm_aapcs_vfpcc i64 @extract_i64_0(<2 x i64> %a) {
 ; CHECK-LABEL: extract_i64_0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = extractelement <2 x i64> %a, i32 0
@@ -1689,8 +1688,7 @@ entry:
 define arm_aapcs_vfpcc i64 @extract_i64_1(<2 x i64> %a) {
 ; CHECK-LABEL: extract_i64_1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    bx lr
 entry:
   %res = extractelement <2 x i64> %a, i32 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index 484a431a1ae1b..a71adb8a655d1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -35,23 +35,19 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @add_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: add_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r1, r0, d2
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adds r1, r1, r4
+; CHECK-NEXT:    adcs r0, r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r3
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = add nsw <2 x i64> %src1, %src2
   ret <2 x i64> %0
@@ -172,23 +168,19 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: sub_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    subs.w lr, r3, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    sbc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    subs r0, r2, r0
-; CHECK-NEXT:    sbc.w r1, r3, r1
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    vmov r1, r0, d0
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    subs.w r2, r2, lr
+; CHECK-NEXT:    sbc.w r3, r3, r12
+; CHECK-NEXT:    subs r1, r4, r1
+; CHECK-NEXT:    sbc.w r0, r5, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r3
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = sub nsw <2 x i64> %src2, %src1
   ret <2 x i64> %0
@@ -309,25 +301,21 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @mul_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: mul_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    umull r12, r3, r1, r0
-; CHECK-NEXT:    mla lr, r1, r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    umull r4, r5, r1, r3
-; CHECK-NEXT:    mla r1, r1, r2, r5
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    mla r0, r2, r0, lr
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r12
-; CHECK-NEXT:    mla r1, r2, r3, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r2, lr, d0
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    umull r12, r3, r2, r0
+; CHECK-NEXT:    mla r1, r2, r1, r3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    mla r0, lr, r0, r1
+; CHECK-NEXT:    umull r6, r7, r2, r4
+; CHECK-NEXT:    mla r2, r2, r5, r7
+; CHECK-NEXT:    vmov q0[2], q0[0], r12, r6
+; CHECK-NEXT:    mla r2, r3, r4, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %0 = mul nsw <2 x i64> %src1, %src2
   ret <2 x i64> %0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
index 01d06fd90b648..bceb1f85ef9b6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
@@ -91,21 +91,19 @@ entry:
 }
 
 define <2 x i64> @vector_add_i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK-LE-LABEL: vector_add_i64:
-; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r7, lr}
-; CHECK-LE-NEXT:    push {r7, lr}
-; CHECK-LE-NEXT:    add.w r12, sp, #8
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r12]
-; CHECK-LE-NEXT:    vmov lr, s0
-; CHECK-LE-NEXT:    vmov r12, s1
-; CHECK-LE-NEXT:    adds.w r0, r0, lr
-; CHECK-LE-NEXT:    vmov lr, s2
-; CHECK-LE-NEXT:    adc.w r1, r1, r12
-; CHECK-LE-NEXT:    vmov r12, s3
-; CHECK-LE-NEXT:    adds.w r2, r2, lr
-; CHECK-LE-NEXT:    adc.w r3, r3, r12
-; CHECK-LE-NEXT:    pop {r7, pc}
+; CHECK-MVE-LABEL: vector_add_i64:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    .save {r7, lr}
+; CHECK-MVE-NEXT:    push {r7, lr}
+; CHECK-MVE-NEXT:    add.w r12, sp, #8
+; CHECK-MVE-NEXT:    vldrw.u32 q0, [r12]
+; CHECK-MVE-NEXT:    vmov r12, lr, d0
+; CHECK-MVE-NEXT:    adds.w r0, r0, r12
+; CHECK-MVE-NEXT:    adc.w r1, r1, lr
+; CHECK-MVE-NEXT:    vmov r12, lr, d1
+; CHECK-MVE-NEXT:    adds.w r2, r2, r12
+; CHECK-MVE-NEXT:    adc.w r3, r3, lr
+; CHECK-MVE-NEXT:    pop {r7, pc}
 ;
 ; CHECK-BE-LABEL: vector_add_i64:
 ; CHECK-BE:       @ %bb.0: @ %entry
@@ -113,15 +111,27 @@ define <2 x i64> @vector_add_i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; CHECK-BE-NEXT:    push {r7, lr}
 ; CHECK-BE-NEXT:    add.w r12, sp, #8
 ; CHECK-BE-NEXT:    vldrw.u32 q0, [r12]
-; CHECK-BE-NEXT:    vmov lr, s1
-; CHECK-BE-NEXT:    vmov r12, s0
+; CHECK-BE-NEXT:    vmov r12, lr, d0
 ; CHECK-BE-NEXT:    adds.w r1, r1, lr
-; CHECK-BE-NEXT:    vmov lr, s3
 ; CHECK-BE-NEXT:    adc.w r0, r0, r12
-; CHECK-BE-NEXT:    vmov r12, s2
+; CHECK-BE-NEXT:    vmov r12, lr, d1
 ; CHECK-BE-NEXT:    adds.w r3, r3, lr
 ; CHECK-BE-NEXT:    adc.w r2, r2, r12
 ; CHECK-BE-NEXT:    pop {r7, pc}
+;
+; CHECK-FP-LABEL: vector_add_i64:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-FP-NEXT:    push {r4, r5, r7, lr}
+; CHECK-FP-NEXT:    add.w r12, sp, #16
+; CHECK-FP-NEXT:    vldrw.u32 q0, [r12]
+; CHECK-FP-NEXT:    vmov r12, lr, d0
+; CHECK-FP-NEXT:    vmov r4, r5, d1
+; CHECK-FP-NEXT:    adds.w r0, r0, r12
+; CHECK-FP-NEXT:    adc.w r1, r1, lr
+; CHECK-FP-NEXT:    adds r2, r2, r4
+; CHECK-FP-NEXT:    adcs r3, r5
+; CHECK-FP-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sum = add <2 x i64> %lhs, %rhs
   ret <2 x i64> %sum
@@ -338,67 +348,67 @@ entry:
 define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK-MVE-LABEL: vector_add_f32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .save {r7, lr}
-; CHECK-MVE-NEXT:    push {r7, lr}
+; CHECK-MVE-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-MVE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-MVE-NEXT:    vmov d11, r2, r3
-; CHECK-MVE-NEXT:    vmov d10, r0, r1
-; CHECK-MVE-NEXT:    add r1, sp, #56
-; CHECK-MVE-NEXT:    vldrw.u32 q6, [r1]
-; CHECK-MVE-NEXT:    vmov r0, s23
-; CHECK-MVE-NEXT:    vmov r1, s27
+; CHECK-MVE-NEXT:    vmov d13, r2, r3
+; CHECK-MVE-NEXT:    vmov d12, r0, r1
+; CHECK-MVE-NEXT:    add r1, sp, #64
+; CHECK-MVE-NEXT:    vldrw.u32 q5, [r1]
+; CHECK-MVE-NEXT:    vmov r4, r0, d13
+; CHECK-MVE-NEXT:    vmov r5, r1, d11
 ; CHECK-MVE-NEXT:    bl __aeabi_fadd
 ; CHECK-MVE-NEXT:    vmov s19, r0
-; CHECK-MVE-NEXT:    vmov r0, s22
-; CHECK-MVE-NEXT:    vmov r1, s26
+; CHECK-MVE-NEXT:    mov r0, r4
+; CHECK-MVE-NEXT:    mov r1, r5
 ; CHECK-MVE-NEXT:    bl __aeabi_fadd
 ; CHECK-MVE-NEXT:    vmov s18, r0
-; CHECK-MVE-NEXT:    vmov r0, s21
-; CHECK-MVE-NEXT:    vmov r1, s25
+; CHECK-MVE-NEXT:    vmov r4, r0, d12
+; CHECK-MVE-NEXT:    vmov r5, r1, d10
 ; CHECK-MVE-NEXT:    bl __aeabi_fadd
 ; CHECK-MVE-NEXT:    vmov s17, r0
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmov r1, s24
+; CHECK-MVE-NEXT:    mov r0, r4
+; CHECK-MVE-NEXT:    mov r1, r5
 ; CHECK-MVE-NEXT:    bl __aeabi_fadd
 ; CHECK-MVE-NEXT:    vmov s16, r0
 ; CHECK-MVE-NEXT:    vmov r2, r3, d9
 ; CHECK-MVE-NEXT:    vmov r0, r1, d8
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-MVE-NEXT:    pop {r7, pc}
+; CHECK-MVE-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-BE-LABEL: vector_add_f32:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    .save {r7, lr}
-; CHECK-BE-NEXT:    push {r7, lr}
+; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-BE-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
-; CHECK-BE-NEXT:    add r1, sp, #56
+; CHECK-BE-NEXT:    add r1, sp, #64
 ; CHECK-BE-NEXT:    vldrw.u32 q6, [r1]
 ; CHECK-BE-NEXT:    vrev64.32 q5, q0
-; CHECK-BE-NEXT:    vmov r0, s23
-; CHECK-BE-NEXT:    vmov r1, s27
+; CHECK-BE-NEXT:    vmov r4, r0, d11
+; CHECK-BE-NEXT:    vmov r5, r1, d13
 ; CHECK-BE-NEXT:    bl __aeabi_fadd
 ; CHECK-BE-NEXT:    vmov s19, r0
-; CHECK-BE-NEXT:    vmov r0, s22
-; CHECK-BE-NEXT:    vmov r1, s26
+; CHECK-BE-NEXT:    mov r0, r4
+; CHECK-BE-NEXT:    mov r1, r5
 ; CHECK-BE-NEXT:    bl __aeabi_fadd
 ; CHECK-BE-NEXT:    vmov s18, r0
-; CHECK-BE-NEXT:    vmov r0, s21
-; CHECK-BE-NEXT:    vmov r1, s25
+; CHECK-BE-NEXT:    vmov r4, r0, d10
+; CHECK-BE-NEXT:    vmov r5, r1, d12
 ; CHECK-BE-NEXT:    bl __aeabi_fadd
 ; CHECK-BE-NEXT:    vmov s17, r0
-; CHECK-BE-NEXT:    vmov r0, s20
-; CHECK-BE-NEXT:    vmov r1, s24
+; CHECK-BE-NEXT:    mov r0, r4
+; CHECK-BE-NEXT:    mov r1, r5
 ; CHECK-BE-NEXT:    bl __aeabi_fadd
 ; CHECK-BE-NEXT:    vmov s16, r0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q4
 ; CHECK-BE-NEXT:    vmov r1, r0, d0
 ; CHECK-BE-NEXT:    vmov r3, r2, d1
 ; CHECK-BE-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-BE-NEXT:    pop {r7, pc}
+; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-FP-LABEL: vector_add_f32:
 ; CHECK-FP:       @ %bb.0: @ %entry

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll
index d3f93d6c394f1..e18ca2d82118b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll
@@ -5,42 +5,42 @@
 define arm_aapcs_vfpcc void @vabd_v4f32(<4 x float> %x, <4 x float> %y, <4 x float>* %z) {
 ; CHECK-MVE-LABEL: vabd_v4f32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-MVE-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-MVE-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-MVE-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-MVE-NEXT:    .pad #4
 ; CHECK-MVE-NEXT:    sub sp, #4
 ; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vmov q4, q1
 ; CHECK-MVE-NEXT:    vmov q5, q0
-; CHECK-MVE-NEXT:    mov r4, r0
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    mov r8, r0
+; CHECK-MVE-NEXT:    vmov r0, r6, d10
+; CHECK-MVE-NEXT:    vmov r1, r7, d8
 ; CHECK-MVE-NEXT:    bl __aeabi_fsub
-; CHECK-MVE-NEXT:    mov r5, r0
-; CHECK-MVE-NEXT:    vmov r0, s21
-; CHECK-MVE-NEXT:    vmov r1, s17
+; CHECK-MVE-NEXT:    mov r9, r0
+; CHECK-MVE-NEXT:    mov r0, r6
+; CHECK-MVE-NEXT:    mov r1, r7
 ; CHECK-MVE-NEXT:    bl __aeabi_fsub
 ; CHECK-MVE-NEXT:    mov r6, r0
-; CHECK-MVE-NEXT:    vmov r0, s22
-; CHECK-MVE-NEXT:    vmov r1, s18
+; CHECK-MVE-NEXT:    vmov r0, r7, d11
+; CHECK-MVE-NEXT:    vmov r1, r4, d9
 ; CHECK-MVE-NEXT:    bl __aeabi_fsub
-; CHECK-MVE-NEXT:    mov r7, r0
-; CHECK-MVE-NEXT:    vmov r0, s23
-; CHECK-MVE-NEXT:    vmov r1, s19
+; CHECK-MVE-NEXT:    mov r5, r0
+; CHECK-MVE-NEXT:    mov r0, r7
+; CHECK-MVE-NEXT:    mov r1, r4
 ; CHECK-MVE-NEXT:    bl __aeabi_fsub
 ; CHECK-MVE-NEXT:    bic r0, r0, #-2147483648
 ; CHECK-MVE-NEXT:    vmov s3, r0
-; CHECK-MVE-NEXT:    bic r0, r7, #-2147483648
+; CHECK-MVE-NEXT:    bic r0, r5, #-2147483648
 ; CHECK-MVE-NEXT:    vmov s2, r0
 ; CHECK-MVE-NEXT:    bic r0, r6, #-2147483648
 ; CHECK-MVE-NEXT:    vmov s1, r0
-; CHECK-MVE-NEXT:    bic r0, r5, #-2147483648
+; CHECK-MVE-NEXT:    bic r0, r9, #-2147483648
 ; CHECK-MVE-NEXT:    vmov s0, r0
-; CHECK-MVE-NEXT:    vstrw.32 q0, [r4]
+; CHECK-MVE-NEXT:    vstrw.32 q0, [r8]
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    add sp, #4
-; CHECK-MVE-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-MVE-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 ;
 ; CHECK-MVEFP-LABEL: vabd_v4f32:
 ; CHECK-MVEFP:       @ %bb.0: @ %entry

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 55ec20bb60974..4e8d880d1a306 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -159,43 +159,35 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-NEXT:    vmov.f32 s14, s3
 ; CHECK-NEXT:    vand q2, q2, q4
 ; CHECK-NEXT:    vand q3, q3, q4
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s13
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r2, r3, d6
 ; CHECK-NEXT:    vmov.f32 s6, s5
 ; CHECK-NEXT:    vmov.f32 s2, s1
 ; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vand q4, q0, q4
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    sbc.w r0, r1, r0
-; CHECK-NEXT:    add.w r1, r2, r0, asr #31
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    eor.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    subs r0, r0, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    sbc.w r1, r2, r1
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbc.w r1, r3, r1
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
-; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov r1, s15
+; CHECK-NEXT:    eor.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov r1, r2, d2
+; CHECK-NEXT:    vmov r3, r0, d8
+; CHECK-NEXT:    subs r1, r3, r1
+; CHECK-NEXT:    sbcs r0, r2
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    add.w r1, r1, r0, asr #31
+; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r12
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    sbc.w r0, r1, r0
-; CHECK-NEXT:    add.w r1, r2, r0, asr #31
-; CHECK-NEXT:    vmov r2, s19
-; CHECK-NEXT:    eor.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    subs r0, r0, r3
-; CHECK-NEXT:    sbc.w r1, r2, r1
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbc.w r1, r3, r1
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
-; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
+; CHECK-NEXT:    eor.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov r1, r2, d3
+; CHECK-NEXT:    vmov r3, r0, d9
+; CHECK-NEXT:    subs r1, r3, r1
+; CHECK-NEXT:    sbcs r0, r2
+; CHECK-NEXT:    add.w r1, r1, r0, asr #31
+; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
@@ -548,57 +540,49 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vand q4, q3, q0
 ; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
-; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov r3, r4, d8
 ; CHECK-NEXT:    vmov.f32 s20, s12
 ; CHECK-NEXT:    vmov.f32 s22, s13
 ; CHECK-NEXT:    vand q5, q5, q0
-; CHECK-NEXT:    vmov r4, s17
-; CHECK-NEXT:    vmov r6, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    vmov r7, s23
-; CHECK-NEXT:    subs.w r8, r6, r3
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    sbc.w r4, r5, r4
-; CHECK-NEXT:    vmov r6, s19
+; CHECK-NEXT:    vmov r5, r6, d10
+; CHECK-NEXT:    subs.w r8, r5, r3
+; CHECK-NEXT:    vmov r7, r3, d11
+; CHECK-NEXT:    sbc.w r4, r6, r4
 ; CHECK-NEXT:    asrs r5, r4, #31
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    bfi r4, r5, #0, #4
-; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r5, r6, d9
 ; CHECK-NEXT:    vmov.f32 s16, s10
 ; CHECK-NEXT:    vmov.f32 s18, s11
 ; CHECK-NEXT:    vand q2, q4, q0
 ; CHECK-NEXT:    vmov.f32 s16, s14
 ; CHECK-NEXT:    vmov.f32 s18, s15
 ; CHECK-NEXT:    vand q3, q4, q0
-; CHECK-NEXT:    vmov r12, s12
-; CHECK-NEXT:    subs.w r9, r3, r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    sbc.w r3, r7, r6
-; CHECK-NEXT:    movs r7, #1
-; CHECK-NEXT:    vmov r6, s15
+; CHECK-NEXT:    subs.w r9, r7, r5
+; CHECK-NEXT:    mov.w r7, #1
+; CHECK-NEXT:    sbcs r3, r6
 ; CHECK-NEXT:    and.w r3, r7, r3, asr #31
-; CHECK-NEXT:    vmov r7, s10
+; CHECK-NEXT:    vmov r7, r5, d7
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    bfi r4, r3, #4, #4
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    subs.w r10, r5, r7
-; CHECK-NEXT:    vmov r7, s9
-; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    sbc.w r3, r6, r3
-; CHECK-NEXT:    vmov r6, s8
-; CHECK-NEXT:    asr.w r11, r3, #31
-; CHECK-NEXT:    subs.w r6, r12, r6
-; CHECK-NEXT:    sbc.w r7, r5, r7
-; CHECK-NEXT:    asrs r7, r7, #31
-; CHECK-NEXT:    vmov q2[2], q2[0], r7, r11
-; CHECK-NEXT:    vmov r7, s8
+; CHECK-NEXT:    vmov r3, r6, d5
+; CHECK-NEXT:    subs.w r10, r7, r3
+; CHECK-NEXT:    vmov r7, r3, d4
+; CHECK-NEXT:    sbcs r5, r6
+; CHECK-NEXT:    vmov r6, r12, d6
+; CHECK-NEXT:    asr.w r11, r5, #31
+; CHECK-NEXT:    subs r6, r6, r7
+; CHECK-NEXT:    sbc.w r3, r12, r3
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r11
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov q2[2], q2[0], r8, r6
 ; CHECK-NEXT:    vmov q2[3], q2[1], r9, r10
-; CHECK-NEXT:    and r7, r7, #1
-; CHECK-NEXT:    rsbs r7, r7, #0
-; CHECK-NEXT:    bfi r4, r7, #8, #4
-; CHECK-NEXT:    movs r7, #1
-; CHECK-NEXT:    and.w r3, r7, r3, asr #31
+; CHECK-NEXT:    and r3, r3, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r4, r3, #8, #4
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    and.w r3, r3, r5, asr #31
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    bfi r4, r3, #12, #4
 ; CHECK-NEXT:    vmsr p0, r4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
index d4a04567f099f..9a9b58a635a9e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -12,12 +12,10 @@ declare i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8>)
 define arm_aapcs_vfpcc i64 @vaddv_v2i64_i64(<2 x i64> %s1) {
 ; CHECK-LABEL: vaddv_v2i64_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %r = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1)
@@ -92,14 +90,12 @@ define arm_aapcs_vfpcc i64 @vaddva_v2i64_i64(<2 x i64> %s1, i64 %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %t = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
index 03bd26af05f44..34fc2bbb86f36 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
@@ -367,22 +367,18 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, <2 x i64> %srcb, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r12, r2, d2
+; CHECK-NEXT:    vmov r3, r1, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    eor.w r2, r3, r12
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
@@ -402,22 +398,18 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, <2 x i64> %srcb, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r12, r2, d2
+; CHECK-NEXT:    vmov r3, r1, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    eor.w r2, r3, r12
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
@@ -441,12 +433,10 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
index aaf97a6503a26..7d4fb98fc148a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
@@ -433,18 +433,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r2, r1
-; CHECK-NEXT:    eors r3, r0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    eors r3, r1
+; CHECK-NEXT:    eors r2, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r12, r3, d0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne
 ; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    eor.w r0, r0, r12
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
@@ -466,18 +464,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r2, r1
-; CHECK-NEXT:    eors r3, r0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    eors r3, r1
+; CHECK-NEXT:    eors r2, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r12, r3, d0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne
 ; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    eor.w r0, r0, r12
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
@@ -503,12 +499,10 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -1014,18 +1008,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_r_eq_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r2, r1
-; CHECK-NEXT:    eors r3, r0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    eors r3, r1
+; CHECK-NEXT:    eors r2, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r12, r3, d0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne
 ; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    eor.w r0, r0, r12
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
@@ -1047,18 +1039,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_r_eq_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r2, r1
-; CHECK-NEXT:    eors r3, r0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    eors r3, r1
+; CHECK-NEXT:    eors r2, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r12, r3, d0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne
 ; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    eor.w r0, r0, r12
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
@@ -1084,12 +1074,10 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_multi_v2i32(<2 x i64> %a, <2 x i32> %b,
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
index 1e8fbb3700b52..b1da62e1aec29 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
@@ -361,11 +361,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_eqz_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -388,11 +386,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_eqz_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -775,11 +771,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_r_eqz_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -802,11 +796,9 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_r_eqz_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index 42a0fbc56c357..9b1175fabce3b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -266,11 +266,9 @@ define arm_aapcs_vfpcc <2 x double> @foo_float_int64(<2 x i64> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s19
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_l2d
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
@@ -292,11 +290,9 @@ define arm_aapcs_vfpcc <2 x double> @foo_float_uint64(<2 x i64> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s19
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_ul2d
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
index c23c81bbb0ca5..91255bbd26cdc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -38,12 +38,10 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
@@ -54,14 +52,16 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
 ; CHECK-LABEL: add_v2i32_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    adc.w r1, r3, r1, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
@@ -134,40 +134,36 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[6]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
@@ -181,46 +177,49 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
 ; CHECK-NEXT:    vmov.s16 r0, q0[1]
 ; CHECK-NEXT:    vmov.s16 r1, q0[0]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s16 r2, q0[3]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s16 r3, q0[2]
-; CHECK-NEXT:    adc.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov.s16 r1, q0[3]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s16 r2, q0[5]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s16 r3, q0[4]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s16 r1, q0[5]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.s16 r2, q0[7]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.s16 r3, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r0, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov.s16 r3, q0[7]
-; CHECK-NEXT:    adc.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s16 r1, q0[6]
-; CHECK-NEXT:    adds r2, r2, r1
-; CHECK-NEXT:    adc.w r1, r0, r1, asr #31
-; CHECK-NEXT:    adds r0, r2, r3
-; CHECK-NEXT:    adc.w r1, r1, r3, asr #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
+; CHECK-NEXT:    vmov r0, r3, d0
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -257,10 +256,9 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
@@ -271,13 +269,18 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
 ; CHECK-LABEL: add_v2i16_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    sxth r1, r0
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    asrs r2, r1, #31
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    adc.w r1, r3, r1, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
@@ -442,88 +445,76 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[8]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[10]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[12]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[14]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[14]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -537,98 +528,97 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
 ; CHECK-NEXT:    vmov.s8 r0, q0[1]
 ; CHECK-NEXT:    vmov.s8 r1, q0[0]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[3]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[2]
-; CHECK-NEXT:    adc.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov.s8 r1, q0[3]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[5]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[4]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s8 r1, q0[5]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[7]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[6]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s8 r1, q0[7]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[9]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[8]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s8 r1, q0[9]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[11]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[10]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s8 r1, q0[11]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[13]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[12]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s8 r1, q0[13]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov.s8 r3, q0[15]
-; CHECK-NEXT:    adc.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov.s8 r1, q0[14]
-; CHECK-NEXT:    adds r2, r2, r1
-; CHECK-NEXT:    adc.w r1, r0, r1, asr #31
-; CHECK-NEXT:    adds r0, r2, r3
-; CHECK-NEXT:    adc.w r1, r1, r3, asr #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[15]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r0, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
+; CHECK-NEXT:    vmov r0, r3, d0
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -644,41 +634,37 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov.u16 r1, q0[0]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[6]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
@@ -694,52 +680,55 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[2]
-; CHECK-NEXT:    adc.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[4]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    adds r2, r2, r1
-; CHECK-NEXT:    adc.w r1, r0, r1, asr #31
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    sxtb r3, r0
-; CHECK-NEXT:    adds r0, r2, r3
-; CHECK-NEXT:    adc.w r1, r1, r3, asr #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r0, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
+; CHECK-NEXT:    vmov r0, r3, d0
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
@@ -778,10 +767,9 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
@@ -792,13 +780,18 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
 ; CHECK-LABEL: add_v2i8_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    sxtb r1, r0
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    asrs r2, r1, #31
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    adc.w r1, r3, r1, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
@@ -809,12 +802,10 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: add_v2i64_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
@@ -863,14 +854,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
@@ -882,17 +871,21 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
 ; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov r12, s4
-; CHECK-NEXT:    adds.w r12, r12, r3
-; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
-; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r2, lr, r2, asr #31
+; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -965,50 +958,46 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
 ; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.u16 r2, q0[1]
 ; CHECK-NEXT:    vmov.u16 r3, q0[0]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, r12, d5
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    add.w r12, r3, r2
+; CHECK-NEXT:    add.w lr, r3, r2
 ; CHECK-NEXT:    vmov.u16 r3, q0[3]
 ; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r2, r12
-; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    add r2, lr
+; CHECK-NEXT:    add.w lr, r2, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[5]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    add r12, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds.w r4, r12, r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add lr, r2
+; CHECK-NEXT:    vmov r3, r2, d5
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[6]
-; CHECK-NEXT:    adc.w r12, r2, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1024,48 +1013,51 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
 ; CHECK-NEXT:    vmov.s16 r2, q0[1]
 ; CHECK-NEXT:    vmov.s16 r3, q0[0]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r12
-; CHECK-NEXT:    vmov lr, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r12, s5
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r3, r2, d2
 ; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov.s16 r3, q0[2]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.s16 r2, q0[3]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
 ; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov.s16 r4, q0[5]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.s16 r2, q0[4]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s16 r2, q0[5]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.s16 r3, q0[4]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s16 r2, q0[7]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.s16 r3, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adc.w r4, r4, r12
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4, asr #31
-; CHECK-NEXT:    vmov.s16 r4, q0[6]
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
-; CHECK-NEXT:    vmov.s16 r4, q0[7]
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r2, r4, r2, asr #31
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -1105,12 +1097,11 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, r12, d1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r1, r1, r12
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
@@ -1122,16 +1113,23 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
 ; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r2, lr, r2, asr #31
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1302,98 +1300,86 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.u8 r2, q0[1]
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, r12, d5
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    add.w r12, r3, r2
+; CHECK-NEXT:    add.w lr, r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[3]
 ; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r2, r12
-; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    add r2, lr
+; CHECK-NEXT:    add.w lr, r2, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
 ; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    add r12, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds.w r4, r12, r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add lr, r2
+; CHECK-NEXT:    vmov r3, r2, d5
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    adc.w r12, r2, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    adc.w r3, r12, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w lr, r4, r2
-; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w lr, r4, r2
-; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w lr, r4, r2
-; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds.w r2, r2, lr
 ; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1409,100 +1395,99 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    vmov.s8 r2, q0[1]
 ; CHECK-NEXT:    vmov.s8 r3, q0[0]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r12
-; CHECK-NEXT:    vmov lr, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r12, s5
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r3, r2, d2
 ; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[2]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.s8 r2, q0[3]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[5]
 ; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov.s8 r4, q0[5]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.s8 r2, q0[4]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    vmov.s8 r3, q0[4]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.s8 r2, q0[6]
-; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.s8 r4, q0[7]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[7]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[6]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.s8 r2, q0[8]
-; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.s8 r4, q0[9]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[9]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[8]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.s8 r2, q0[10]
-; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.s8 r4, q0[11]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[11]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[10]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.s8 r2, q0[12]
-; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.s8 r4, q0[13]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[13]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[12]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.s8 r2, q0[15]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adc.w r4, r4, r12
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4, asr #31
-; CHECK-NEXT:    vmov.s8 r4, q0[14]
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
-; CHECK-NEXT:    vmov.s8 r4, q0[15]
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r2, r4, r2, asr #31
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -1514,51 +1499,47 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
 ; CHECK-LABEL: add_v8i8_v8i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vmov.u16 r2, q0[1]
 ; CHECK-NEXT:    vmov.u16 r3, q0[0]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, r12, d5
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    add.w r12, r3, r2
+; CHECK-NEXT:    add.w lr, r3, r2
 ; CHECK-NEXT:    vmov.u16 r3, q0[3]
 ; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r2, r12
-; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    add r2, lr
+; CHECK-NEXT:    add.w lr, r2, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[5]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    add r12, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds.w r4, r12, r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    add lr, r2
+; CHECK-NEXT:    vmov r3, r2, d5
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[6]
-; CHECK-NEXT:    adc.w r12, r2, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1576,54 +1557,57 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r12
-; CHECK-NEXT:    vmov lr, s6
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r12, s5
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r3, r2, d2
 ; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[2]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
 ; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov.u16 r4, q0[5]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adc.w r4, r4, r12
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4, asr #31
-; CHECK-NEXT:    vmov.u16 r4, q0[6]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
-; CHECK-NEXT:    vmov.u16 r4, q0[7]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r2, r4, r2, asr #31
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
@@ -1665,12 +1649,11 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, r12, d1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r1, r1, r12
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
@@ -1682,16 +1665,23 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
 ; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r2, lr, r2, asr #31
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1704,14 +1694,12 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index 5d6a8a9ba7b67..3ac9dcb83afe2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -60,12 +60,10 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %b) {
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -97,12 +95,10 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %b) {
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -215,12 +211,10 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r12, r2, d7
+; CHECK-NEXT:    vmov r3, r1, d6
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    add.w r2, r3, r12
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
@@ -232,24 +226,22 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov r0, r3, d6
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q2[4]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q2[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q2[7]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
 ; CHECK-NEXT:    vmov.u16 r3, q2[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    and r0, r1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r0, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
@@ -259,33 +251,29 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r12, r3
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-NEXT:    ubfx r1, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov r0, r3, d4
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d5
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.u16 r3, q0[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -324,10 +312,8 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r1, r12, d5
+; CHECK-NEXT:    vmov r3, r2, d4
 ; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
@@ -343,15 +329,13 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r0
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds r1, r1, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adcs r2, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, r3, d4
+; CHECK-NEXT:    adds r0, r0, r1
 ; CHECK-NEXT:    adc.w r1, r2, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q1[4]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[7]
@@ -372,35 +356,31 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r0, r3, d2
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d3
 ; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov.s16 r2, q0[7]
-; CHECK-NEXT:    vmov.s16 r3, q0[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.s16 r1, q0[7]
+; CHECK-NEXT:    vmov.s16 r2, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -461,12 +441,10 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %b) {
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    orrs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -502,12 +480,10 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %b) {
 ; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -766,6 +742,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
@@ -810,12 +788,10 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r1, s27
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vmov r12, r2, d13
+; CHECK-NEXT:    vmov r3, r1, d12
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s26
-; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    add.w r2, r3, r12
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
@@ -827,24 +803,22 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov q7[2], q7[0], r3, r0
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    vmov r1, s27
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov r0, r3, d12
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d13
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q5[6]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q5[4]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q5[6]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q5[7]
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q5[7]
 ; CHECK-NEXT:    vmov.u16 r3, q5[5]
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r1
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    and r0, r1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r0, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
@@ -854,33 +828,29 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r0
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r12, r3
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-NEXT:    ubfx r1, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov r0, r3, d10
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d11
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r1
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d10
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d11
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -918,35 +888,31 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov r0, r3, d6
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d7
 ; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    vmov.u8 r3, q0[10]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[4]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d6
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r0, r2
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds r1, r1, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov.u16 r3, q2[4]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[7]
 ; CHECK-NEXT:    vmov.u16 r3, q2[5]
@@ -964,35 +930,31 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    vmov r0, r3, d4
+; CHECK-NEXT:    adds.w lr, r1, r0
+; CHECK-NEXT:    adc.w r1, r12, r3
+; CHECK-NEXT:    vmov r3, r0, d5
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    vmov.u8 r3, q0[14]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -1049,10 +1011,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vmov r12, s23
-; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    vmov r1, r12, d11
+; CHECK-NEXT:    vmov r3, r2, d10
 ; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
@@ -1068,15 +1028,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q6[3], q6[1], r3, r0
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    adds r1, r1, r3
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adcs r2, r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, r3, d10
+; CHECK-NEXT:    adds r0, r0, r1
 ; CHECK-NEXT:    adc.w r1, r2, r3
+; CHECK-NEXT:    vmov r2, r3, d11
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[6]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q4[4]
 ; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[7]
@@ -1097,35 +1055,31 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q5[3], q5[1], r3, r0
 ; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov r0, r3, d8
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d9
 ; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s19
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
-; CHECK-NEXT:    vmov.s8 r2, q0[7]
-; CHECK-NEXT:    vmov.s8 r3, q0[6]
-; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT:    vmov.s8 r1, q0[7]
+; CHECK-NEXT:    vmov.s8 r2, q0[6]
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
+; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q5[3], q5[1], r3, r2
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
 ; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s19
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d8
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d9
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q3[8]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q3[9]
@@ -1165,37 +1119,33 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r0
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov r0, r3, d4
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d5
 ; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov.s8 r2, q0[11]
-; CHECK-NEXT:    vmov.s8 r3, q0[10]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT:    vmov.s8 r1, q0[11]
+; CHECK-NEXT:    vmov.s8 r2, q0[10]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    vmov.u16 r3, q1[4]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d4
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d5
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.u16 r3, q1[4]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[7]
 ; CHECK-NEXT:    vmov.u16 r3, q1[5]
@@ -1215,35 +1165,31 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r0, r3, d2
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d3
 ; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov.s8 r2, q0[15]
-; CHECK-NEXT:    vmov.s8 r3, q0[14]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.s8 r1, q0[15]
+; CHECK-NEXT:    vmov.s8 r2, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1285,12 +1231,10 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r12, r2, d7
+; CHECK-NEXT:    vmov r3, r1, d6
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    add.w r2, r3, r12
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
@@ -1302,24 +1246,22 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov r0, r3, d6
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q2[4]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q2[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q2[7]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
 ; CHECK-NEXT:    vmov.u16 r3, q2[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    and r0, r1, #1
-; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r0, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
@@ -1329,33 +1271,29 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r12, r3
-; CHECK-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-NEXT:    ubfx r1, r1, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov r0, r3, d4
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d5
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.u16 r3, q0[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1397,10 +1335,8 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r1, r12, d5
+; CHECK-NEXT:    vmov r3, r2, d4
 ; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
@@ -1418,15 +1354,13 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r0
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds r1, r1, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adcs r2, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, r3, d4
+; CHECK-NEXT:    adds r0, r0, r1
 ; CHECK-NEXT:    adc.w r1, r2, r3
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q1[4]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[7]
@@ -1449,37 +1383,33 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r0, r3, d2
+; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r0, d3
 ; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i8> %b, zeroinitializer
@@ -1543,12 +1473,10 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %b) {
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    orrs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1584,12 +1512,10 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %b) {
 ; CHECK-NEXT:    asrs r1, r1, #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1602,11 +1528,9 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) {
 ; CHECK-LABEL: add_v2i64_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, r2, d2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -1617,12 +1541,10 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) {
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
@@ -1695,14 +1617,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b,
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -1737,14 +1657,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %b,
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -1865,34 +1783,30 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r12
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r12, s15
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    vmov lr, s14
-; CHECK-NEXT:    orr.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    add lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    vmov r12, lr, d7
+; CHECK-NEXT:    vmov r3, r4, d6
+; CHECK-NEXT:    orr.w lr, lr, r4
+; CHECK-NEXT:    ubfx r4, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
+; CHECK-NEXT:    add r3, r12
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r4
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r4
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.u16 r3, q0[2]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r4, q0[2]
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[4]
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov r2, r4, d6
+; CHECK-NEXT:    adds.w r12, r3, r2
+; CHECK-NEXT:    adc.w r3, lr, r4
+; CHECK-NEXT:    vmov r4, r2, d7
+; CHECK-NEXT:    adds.w lr, r12, r4
+; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov.u16 r3, q2[6]
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q2[7]
 ; CHECK-NEXT:    vmov.u16 r3, q2[5]
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
@@ -1909,18 +1823,16 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds.w lr, lr, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, r4, d4
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov r3, r4, d5
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r4, r4, r12
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
@@ -1928,16 +1840,14 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds.w r12, lr, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d1
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -1979,16 +1889,14 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds r5, r4, r2
-; CHECK-NEXT:    ubfx r4, lr, #12, #1
+; CHECK-NEXT:    vmov r2, r12, d5
+; CHECK-NEXT:    vmov r3, r4, d4
+; CHECK-NEXT:    adds r5, r3, r2
 ; CHECK-NEXT:    ubfx r2, lr, #8, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adc.w r3, r4, r12
+; CHECK-NEXT:    ubfx r4, lr, #12, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
 ; CHECK-NEXT:    vmov.s16 r2, q0[3]
@@ -1998,13 +1906,11 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    asrs r4, r4, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adcs r3, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r12, r5, r2
+; CHECK-NEXT:    vmov r2, r4, d4
+; CHECK-NEXT:    adds r2, r2, r5
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r12, r2, r5
 ; CHECK-NEXT:    vmov.u16 r5, q1[6]
 ; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov.u16 r4, q1[4]
@@ -2027,37 +1933,33 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    asrs r4, r4, #31
 ; CHECK-NEXT:    vmov q2[3], q2[1], r4, r2
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r2, r4, d2
+; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d3
 ; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    ubfx r3, r5, #12, #1
 ; CHECK-NEXT:    ubfx r5, r5, #8, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r4
-; CHECK-NEXT:    vmov.s16 r5, q0[7]
-; CHECK-NEXT:    vmov.s16 r4, q0[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
+; CHECK-NEXT:    vmov.s16 r3, q0[7]
+; CHECK-NEXT:    vmov.s16 r5, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
+; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    vmov r5, r4, d1
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -2071,6 +1973,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b, i64 %a) {
 ; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i64 q2, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
@@ -2087,15 +1991,13 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b,
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    orr.w r12, r3, r2
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r12, lr, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    orr.w r3, r3, lr
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
   %xx = zext <2 x i16> %x to <2 x i64>
@@ -2133,14 +2035,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b,
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -2304,8 +2204,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
@@ -2350,34 +2250,30 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov q7[2], q7[0], r3, r12
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r12, s27
-; CHECK-NEXT:    vmov r3, s25
-; CHECK-NEXT:    vmov lr, s26
-; CHECK-NEXT:    orr.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    add lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    vmov r12, lr, d13
+; CHECK-NEXT:    vmov r3, r4, d12
+; CHECK-NEXT:    orr.w lr, lr, r4
+; CHECK-NEXT:    ubfx r4, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
+; CHECK-NEXT:    add r3, r12
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r4
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r4
 ; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
+; CHECK-NEXT:    vmov.u8 r4, q0[2]
+; CHECK-NEXT:    vmov q7[2], q7[0], r4, r2
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s27
-; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov.u16 r3, q5[4]
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov.u16 r2, q5[6]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov r2, r4, d12
+; CHECK-NEXT:    adds.w r12, r3, r2
+; CHECK-NEXT:    adc.w r3, lr, r4
+; CHECK-NEXT:    vmov r4, r2, d13
+; CHECK-NEXT:    adds.w lr, r12, r4
+; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov.u16 r3, q5[6]
+; CHECK-NEXT:    vmov.u16 r2, q5[4]
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q5[7]
 ; CHECK-NEXT:    vmov.u16 r3, q5[5]
 ; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
@@ -2394,18 +2290,16 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov q6[2], q6[0], r4, r3
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    adds.w lr, lr, r4
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, r4, d10
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov r3, r4, d11
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r4, r4, r12
 ; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
 ; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
@@ -2413,14 +2307,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    adc.w r4, r12, r2
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    adds.w r12, lr, r3
-; CHECK-NEXT:    adc.w lr, r4, r2
+; CHECK-NEXT:    vmov r2, r3, d10
+; CHECK-NEXT:    adds.w r12, lr, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d11
+; CHECK-NEXT:    adds.w lr, r12, r4
+; CHECK-NEXT:    adc.w r12, r3, r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -2440,101 +2332,93 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.u16 r4, q2[0]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
+; CHECK-NEXT:    vmov.u16 r3, q2[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.u16 r4, q2[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
+; CHECK-NEXT:    vmov.u16 r3, q2[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r4
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[9]
 ; CHECK-NEXT:    vmov.u8 r4, q0[8]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    adds.w r5, r12, r4
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    adc.w r12, lr, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    ubfx r4, r2, #12, #1
+; CHECK-NEXT:    vmov r3, r4, d6
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov r3, r4, d7
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    rsb.w r3, r3, #0
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r4
+; CHECK-NEXT:    adc.w r4, r4, r12
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r2
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov.u16 r4, q2[4]
-; CHECK-NEXT:    adc.w r12, r2, r3
+; CHECK-NEXT:    vmov r2, r3, d6
+; CHECK-NEXT:    adds.w r12, lr, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d7
+; CHECK-NEXT:    adds.w lr, r12, r4
+; CHECK-NEXT:    adc.w r12, r3, r2
 ; CHECK-NEXT:    vmov.u16 r3, q2[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[7]
-; CHECK-NEXT:    vmov.u16 r4, q2[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.u16 r3, q2[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmrs r3, p0
-; CHECK-NEXT:    and r2, r3, #1
-; CHECK-NEXT:    ubfx r4, r3, #4, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[13]
 ; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds r2, r2, r5
-; CHECK-NEXT:    adc.w r5, r12, r4
-; CHECK-NEXT:    ubfx r4, r3, #12, #1
-; CHECK-NEXT:    ubfx r3, r3, #8, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov.u8 r3, q0[15]
-; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vmov r3, r4, d4
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov r3, r4, d5
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r4, r4, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r3, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds.w r12, lr, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d1
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -2594,16 +2478,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov r12, s23
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    adds r5, r4, r2
-; CHECK-NEXT:    ubfx r4, lr, #12, #1
+; CHECK-NEXT:    vmov r2, r12, d11
+; CHECK-NEXT:    vmov r3, r4, d10
+; CHECK-NEXT:    adds r5, r3, r2
 ; CHECK-NEXT:    ubfx r2, lr, #8, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adc.w r3, r4, r12
+; CHECK-NEXT:    ubfx r4, lr, #12, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r2, r4
 ; CHECK-NEXT:    vmov q5[3], q5[1], r2, r4
 ; CHECK-NEXT:    vmov.s8 r2, q0[3]
@@ -2613,13 +2495,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    asrs r4, r4, #31
 ; CHECK-NEXT:    vmov q6[3], q6[1], r4, r2
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s23
-; CHECK-NEXT:    adcs r3, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r5, r2
+; CHECK-NEXT:    vmov r2, r4, d10
+; CHECK-NEXT:    adds r2, r2, r5
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r5, r4, d11
+; CHECK-NEXT:    adds.w r12, r2, r5
 ; CHECK-NEXT:    vmov.u16 r5, q4[6]
 ; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov.u16 r4, q4[4]
@@ -2642,35 +2522,31 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    asrs r4, r4, #31
 ; CHECK-NEXT:    vmov q5[3], q5[1], r4, r2
 ; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    vmov r2, r4, d8
+; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d9
 ; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    ubfx r3, r5, #12, #1
 ; CHECK-NEXT:    ubfx r5, r5, #8, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
-; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
-; CHECK-NEXT:    vmov.s8 r5, q0[7]
-; CHECK-NEXT:    vmov.s8 r4, q0[6]
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r5
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov q4[2], q4[0], r5, r3
+; CHECK-NEXT:    vmov q4[3], q4[1], r5, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[7]
+; CHECK-NEXT:    vmov.s8 r5, q0[6]
+; CHECK-NEXT:    vmov q5[2], q5[0], r5, r3
+; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r5
+; CHECK-NEXT:    vmov q5[3], q5[1], r5, r3
 ; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r5, s17
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s19
-; CHECK-NEXT:    adds.w r12, r2, r4
-; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r3, r5, d8
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r5, r2
+; CHECK-NEXT:    vmov r2, r4, d9
+; CHECK-NEXT:    adds.w r12, r3, r2
+; CHECK-NEXT:    adc.w r3, r5, r4
 ; CHECK-NEXT:    vmov.u8 r5, q3[8]
 ; CHECK-NEXT:    vmov.16 q4[0], r5
 ; CHECK-NEXT:    vmov.u8 r5, q3[9]
@@ -2710,37 +2586,33 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    asrs r4, r4, #31
 ; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r2, r4, d4
+; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d5
 ; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    ubfx r3, r5, #12, #1
 ; CHECK-NEXT:    ubfx r5, r5, #8, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r5, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r5, r4
-; CHECK-NEXT:    vmov.s8 r5, q0[11]
-; CHECK-NEXT:    vmov.s8 r4, q0[10]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r5, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r5, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[11]
+; CHECK-NEXT:    vmov.s8 r5, q0[10]
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r3
+; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    vmov q3[3], q3[1], r5, r3
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    adds.w r12, r2, r4
-; CHECK-NEXT:    vmov.u16 r4, q1[4]
-; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r3, r5, d4
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r5, r2
+; CHECK-NEXT:    vmov r2, r4, d5
+; CHECK-NEXT:    adds.w r12, r3, r2
+; CHECK-NEXT:    adc.w r3, r5, r4
 ; CHECK-NEXT:    vmov.u16 r5, q1[6]
+; CHECK-NEXT:    vmov.u16 r4, q1[4]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r4, r5
 ; CHECK-NEXT:    vmov.u16 r5, q1[7]
 ; CHECK-NEXT:    vmov.u16 r4, q1[5]
@@ -2760,37 +2632,33 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    asrs r4, r4, #31
 ; CHECK-NEXT:    vmov q2[3], q2[1], r4, r2
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r2, r4, d2
+; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, r2, d3
 ; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    ubfx r3, r5, #12, #1
 ; CHECK-NEXT:    ubfx r5, r5, #8, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r4
-; CHECK-NEXT:    vmov.s8 r5, q0[15]
-; CHECK-NEXT:    vmov.s8 r4, q0[14]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
+; CHECK-NEXT:    vmov.s8 r3, q0[15]
+; CHECK-NEXT:    vmov.s8 r5, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
+; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    vmov r5, r4, d1
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -2805,6 +2673,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i64 %a) {
 ; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
@@ -2821,15 +2691,13 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i6
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    orr.w r12, r3, r2
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r12, lr, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    orr.w r3, r3, lr
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
   %xx = zext <2 x i8> %x to <2 x i64>
@@ -2867,14 +2735,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -2890,29 +2756,25 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %b, i64
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r12, s5
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    vmov r3, r2, d2
+; CHECK-NEXT:    csetm r12, ne
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    orrs.w r3, r3, r12
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r12
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
index 699e2acf57ae7..8191cd90acebf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
@@ -16,11 +16,9 @@ entry:
 define arm_aapcs_vfpcc i32 @and_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: and_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -33,11 +31,9 @@ define arm_aapcs_vfpcc i32 @and_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: and_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -49,11 +45,9 @@ entry:
 define arm_aapcs_vfpcc i16 @and_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: and_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -170,12 +164,10 @@ entry:
 define arm_aapcs_vfpcc i64 @and_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: and_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    ands r0, r2
+; CHECK-NEXT:    ands r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x)
@@ -186,12 +178,10 @@ define arm_aapcs_vfpcc i64 @and_v4i64(<4 x i64> %x) {
 ; CHECK-LABEL: and_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    ands r0, r2
+; CHECK-NEXT:    ands r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x)
@@ -215,11 +205,9 @@ entry:
 define arm_aapcs_vfpcc i32 @and_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: and_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
@@ -234,11 +222,9 @@ define arm_aapcs_vfpcc i32 @and_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: and_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
@@ -252,11 +238,9 @@ entry:
 define arm_aapcs_vfpcc i16 @and_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: and_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
@@ -388,15 +372,15 @@ entry:
 define arm_aapcs_vfpcc i64 @and_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-LABEL: and_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    ands r2, r3
-; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ands r0, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ands r2, r3
+; CHECK-NEXT:    and.w r2, lr, r12
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x)
   %r = and i64 %y, %z
@@ -406,16 +390,16 @@ entry:
 define arm_aapcs_vfpcc i64 @and_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-LABEL: and_v4i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    ands r2, r3
-; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ands r0, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ands r2, r3
+; CHECK-NEXT:    and.w r2, lr, r12
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x)
   %r = and i64 %y, %z
@@ -437,11 +421,9 @@ entry:
 define arm_aapcs_vfpcc i32 @or_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: or_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -454,11 +436,9 @@ define arm_aapcs_vfpcc i32 @or_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: or_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -470,11 +450,9 @@ entry:
 define arm_aapcs_vfpcc i16 @or_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: or_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -591,12 +569,10 @@ entry:
 define arm_aapcs_vfpcc i64 @or_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: or_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    orrs r0, r2
+; CHECK-NEXT:    orrs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x)
@@ -607,12 +583,10 @@ define arm_aapcs_vfpcc i64 @or_v4i64(<4 x i64> %x) {
 ; CHECK-LABEL: or_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    orrs r0, r2
+; CHECK-NEXT:    orrs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x)
@@ -636,11 +610,9 @@ entry:
 define arm_aapcs_vfpcc i32 @or_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: or_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
@@ -655,11 +627,9 @@ define arm_aapcs_vfpcc i32 @or_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: or_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
@@ -673,11 +643,9 @@ entry:
 define arm_aapcs_vfpcc i16 @or_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: or_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
@@ -809,15 +777,15 @@ entry:
 define arm_aapcs_vfpcc i64 @or_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-LABEL: or_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    orrs r0, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    orr.w r2, lr, r12
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x)
   %r = or i64 %y, %z
@@ -827,16 +795,16 @@ entry:
 define arm_aapcs_vfpcc i64 @or_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-LABEL: or_v4i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    orrs r0, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    orr.w r2, lr, r12
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x)
   %r = or i64 %y, %z
@@ -858,11 +826,9 @@ entry:
 define arm_aapcs_vfpcc i32 @xor_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: xor_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -875,11 +841,9 @@ define arm_aapcs_vfpcc i32 @xor_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: xor_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -891,11 +855,9 @@ entry:
 define arm_aapcs_vfpcc i16 @xor_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: xor_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1012,12 +974,10 @@ entry:
 define arm_aapcs_vfpcc i64 @xor_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: xor_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x)
@@ -1028,12 +988,10 @@ define arm_aapcs_vfpcc i64 @xor_v4i64(<4 x i64> %x) {
 ; CHECK-LABEL: xor_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x)
@@ -1057,11 +1015,9 @@ entry:
 define arm_aapcs_vfpcc i32 @xor_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: xor_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
@@ -1076,11 +1032,9 @@ define arm_aapcs_vfpcc i32 @xor_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: xor_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
@@ -1094,11 +1048,9 @@ entry:
 define arm_aapcs_vfpcc i16 @xor_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: xor_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
@@ -1230,15 +1182,15 @@ entry:
 define arm_aapcs_vfpcc i64 @xor_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-LABEL: xor_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    eors r2, r3
-; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    eors r0, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    eor.w r2, lr, r12
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x)
   %r = xor i64 %y, %z
@@ -1248,16 +1200,16 @@ entry:
 define arm_aapcs_vfpcc i64 @xor_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-LABEL: xor_v4i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
 ; CHECK-NEXT:    eors r2, r3
-; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    eors r0, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    eor.w r2, lr, r12
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x)
   %r = xor i64 %y, %z

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 4393e4646bab5..e7886dca32f59 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -96,8 +96,8 @@ for.cond.cleanup:                                 ; preds = %for.body, %middle.b
 define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-LABEL: mul_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    blt .LBB1_8
@@ -119,14 +119,12 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB1_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov lr, r3, d1
 ; CHECK-NEXT:    cmp r12, r1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    mul lr, r3, r2
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    mul r3, lr, r3
+; CHECK-NEXT:    mul r2, r4, r2
 ; CHECK-NEXT:    mul r2, r3, r2
-; CHECK-NEXT:    mul r2, r2, lr
 ; CHECK-NEXT:    beq .LBB1_8
 ; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader1
 ; CHECK-NEXT:    sub.w lr, r1, r12
@@ -138,7 +136,7 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    le lr, .LBB1_7
 ; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
@@ -190,8 +188,8 @@ for.cond.cleanup:                                 ; preds = %for.body, %middle.b
 define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-LABEL: and_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    blt .LBB2_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -217,13 +215,11 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB2_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmov r12, s3
+; CHECK-NEXT:    vmov lr, r12, d1
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    and.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    and.w r2, r2, lr
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    and.w r12, r12, lr
+; CHECK-NEXT:    and.w r2, r2, r4
 ; CHECK-NEXT:    and.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB2_9
 ; CHECK-NEXT:  .LBB2_7: @ %for.body.preheader1
@@ -236,7 +232,7 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    le lr, .LBB2_8
 ; CHECK-NEXT:  .LBB2_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
@@ -288,8 +284,8 @@ for.cond.cleanup:                                 ; preds = %for.body, %middle.b
 define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-LABEL: or_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    blt .LBB3_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -315,13 +311,11 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB3_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmov r12, s3
+; CHECK-NEXT:    vmov lr, r12, d1
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    orr.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    orr.w r2, r2, lr
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    orr.w r12, r12, lr
+; CHECK-NEXT:    orr.w r2, r2, r4
 ; CHECK-NEXT:    orr.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB3_9
 ; CHECK-NEXT:  .LBB3_7: @ %for.body.preheader1
@@ -334,7 +328,7 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    le lr, .LBB3_8
 ; CHECK-NEXT:  .LBB3_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
@@ -386,8 +380,8 @@ for.cond.cleanup:                                 ; preds = %for.body, %middle.b
 define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-LABEL: xor_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    blt .LBB4_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -413,13 +407,11 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    veor q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB4_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmov r12, s3
+; CHECK-NEXT:    vmov lr, r12, d1
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    eor.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    eor.w r2, r2, lr
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    eor.w r12, r12, lr
+; CHECK-NEXT:    eor.w r2, r2, r4
 ; CHECK-NEXT:    eor.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB4_9
 ; CHECK-NEXT:  .LBB4_7: @ %for.body.preheader1
@@ -432,7 +424,7 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    le lr, .LBB4_8
 ; CHECK-NEXT:  .LBB4_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index 3b3471476dfad..83d348f516665 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -42,12 +42,10 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmullb.u32 q2, q0, q1
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
@@ -61,12 +59,10 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: add_v2i32_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmullb.s32 q2, q0, q1
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
@@ -247,12 +243,18 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
 ; CHECK-NEXT:    vmov.i64 q2, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    umlal r0, r1, r3, r2
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
@@ -265,16 +267,22 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
 ; CHECK-LABEL: add_v2i16_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    smlal r0, r1, r3, r2
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
@@ -549,8 +557,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.u8 r0, q1[3]
@@ -569,159 +577,161 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    umull r0, r1, r1, r0
 ; CHECK-NEXT:    umull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.u8 r2, q1[1]
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
-; CHECK-NEXT:    vmov.u8 r3, q1[1]
-; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[0]
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
-; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov q5[2], q5[0], r1, r3
+; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vand q5, q5, q2
 ; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov lr, s12
-; CHECK-NEXT:    vmov r12, s13
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    smlabb r0, r4, r3, r0
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    vmov r1, s20
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov lr, r12, d6
+; CHECK-NEXT:    umull r1, r2, r1, r2
+; CHECK-NEXT:    smlabb r0, r0, r3, r1
 ; CHECK-NEXT:    adds.w r0, r0, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[4]
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adc.w r1, r2, r12
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    adc.w lr, r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[4]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
 ; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[6]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    umull r1, r3, r1, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[6]
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[7]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    umull r1, r3, r1, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[9]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[8]
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    umull r1, r3, r1, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[11]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[10]
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[11]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    umull r1, r3, r1, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r2, r3
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    umull r1, r3, r1, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[15]
 ; CHECK-NEXT:    vmov.u8 r3, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[15]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r3
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    umlal r0, r1, r3, r2
 ; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    umull r0, r2, r0, r2
+; CHECK-NEXT:    umull r1, r3, r1, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    umlal r0, r1, r3, r2
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -743,27 +753,25 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov r0, r1, d5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    adds.w r12, r2, r0
+; CHECK-NEXT:    vmov.s8 r2, q1[3]
+; CHECK-NEXT:    adc.w lr, r3, r1
 ; CHECK-NEXT:    vmov.s8 r3, q0[3]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.s8 r1, q1[3]
 ; CHECK-NEXT:    vmov.s8 r0, q1[2]
-; CHECK-NEXT:    vmov.s8 r2, q0[2]
-; CHECK-NEXT:    smull r1, r3, r3, r1
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r1, r2
+; CHECK-NEXT:    vmov.s8 r1, q0[2]
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[5]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[5]
 ; CHECK-NEXT:    vmov.s8 r0, q1[4]
 ; CHECK-NEXT:    vmov.s8 r1, q0[4]
@@ -771,14 +779,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r1, r2
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[7]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[7]
 ; CHECK-NEXT:    vmov.s8 r0, q1[6]
 ; CHECK-NEXT:    vmov.s8 r1, q0[6]
@@ -786,14 +793,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r1, r2
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[9]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[9]
 ; CHECK-NEXT:    vmov.s8 r0, q1[8]
 ; CHECK-NEXT:    vmov.s8 r1, q0[8]
@@ -801,14 +807,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r1, r2
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[11]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[11]
 ; CHECK-NEXT:    vmov.s8 r0, q1[10]
 ; CHECK-NEXT:    vmov.s8 r1, q0[10]
@@ -816,14 +821,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r1, r2
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[13]
-; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[13]
 ; CHECK-NEXT:    vmov.s8 r0, q1[12]
 ; CHECK-NEXT:    vmov.s8 r1, q0[12]
@@ -831,19 +835,26 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r2, r3
-; CHECK-NEXT:    vmov.s8 r2, q1[14]
-; CHECK-NEXT:    vmov.s8 r3, q0[14]
-; CHECK-NEXT:    smlal r0, r1, r3, r2
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r2, r3, d5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[15]
 ; CHECK-NEXT:    vmov.s8 r3, q0[15]
-; CHECK-NEXT:    smlal r0, r1, r3, r2
+; CHECK-NEXT:    vmov.s8 r0, q1[14]
+; CHECK-NEXT:    vmov.s8 r1, q0[14]
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -942,16 +953,22 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: add_v2i8_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smlal r0, r1, r3, r2
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
@@ -964,25 +981,25 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: add_v2i64_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    umull r12, r2, r1, r0
-; CHECK-NEXT:    mla r1, r1, r3, r2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    mla lr, r2, r0, r1
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    umull r3, r1, r2, r0
-; CHECK-NEXT:    mla r1, r2, r4, r1
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    mla r1, r2, r0, r1
-; CHECK-NEXT:    adds.w r0, r12, r3
-; CHECK-NEXT:    adc.w r1, r1, lr
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    vmov r0, r12, d3
+; CHECK-NEXT:    vmov r2, lr, d1
+; CHECK-NEXT:    vmov r4, r9, d2
+; CHECK-NEXT:    vmov r6, r7, d0
+; CHECK-NEXT:    umull r1, r8, r2, r0
+; CHECK-NEXT:    umull r3, r5, r6, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    mla r1, r2, r12, r8
+; CHECK-NEXT:    mla r1, lr, r0, r1
+; CHECK-NEXT:    mla r0, r6, r9, r5
+; CHECK-NEXT:    mla r0, r7, r4, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   %m = mul <2 x i64> %x, %y
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
@@ -1035,14 +1052,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmullb.u32 q2, q0, q1
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov lr, s9
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov r3, r2, d4
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
@@ -1059,14 +1074,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmullb.s32 q2, q0, q1
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov lr, s9
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov r3, r2, d4
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
@@ -1230,14 +1243,20 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK-NEXT:    vmov.i64 q2, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s6
-; CHECK-NEXT:    umull r2, lr, r3, r2
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    umlal r2, lr, r3, r12
+; CHECK-NEXT:    vmov lr, s4
+; CHECK-NEXT:    umull r12, r3, r3, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    umull r2, lr, r2, lr
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], lr, r3
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r2, lr, d0
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
@@ -1253,18 +1272,24 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    smull r2, r12, r3, r2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    sxth.w lr, r3
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    sxth r2, r2
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    smlal r2, r12, r3, lr
+; CHECK-NEXT:    smull r12, r3, r3, r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    sxth.w lr, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    smull r2, lr, r2, lr
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], lr, r3
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r2, lr, d0
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
@@ -1462,8 +1487,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.u8 r2, q1[3]
@@ -1477,166 +1502,168 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r12, s14
 ; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov.u8 r4, q1[0]
+; CHECK-NEXT:    vmov.u8 r4, q0[0]
 ; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov.u8 r5, q0[0]
 ; CHECK-NEXT:    umull lr, r12, r2, r12
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    umull r2, r3, r2, r3
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, lr
-; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r2
-; CHECK-NEXT:    vmov.u8 r4, q0[1]
-; CHECK-NEXT:    vmov q5[2], q5[0], r5, r4
+; CHECK-NEXT:    vmov.u8 r2, q1[0]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r12
+; CHECK-NEXT:    vmov.u8 r3, q1[1]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[1]
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vand q5, q5, q2
 ; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r12
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    vmov r6, s22
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov lr, s13
-; CHECK-NEXT:    umull r2, r4, r4, r2
-; CHECK-NEXT:    smlabb r2, r6, r5, r2
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov.u8 r5, q1[4]
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r4, lr
-; CHECK-NEXT:    vmov.u8 r4, q0[4]
-; CHECK-NEXT:    adds.w lr, r2, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q0[5]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    vmov lr, r12, d6
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    smlabb r2, r5, r4, r2
+; CHECK-NEXT:    vmov r5, r4, d7
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[5]
+; CHECK-NEXT:    adc.w lr, r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[4]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q0[5]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
 ; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
+; CHECK-NEXT:    vmov r5, s14
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r2, r5, r2, r5
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
-; CHECK-NEXT:    vmov.u8 r4, q0[6]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds.w r6, r6, lr
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[7]
-; CHECK-NEXT:    adc.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q0[7]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov r2, r3, d6
+; CHECK-NEXT:    vmov r5, r4, d7
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
+; CHECK-NEXT:    vmov.u8 r5, q1[7]
+; CHECK-NEXT:    vmov.u8 r4, q1[6]
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q0[7]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r2, r5, r2, r5
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[9]
-; CHECK-NEXT:    adc.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[8]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q0[9]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov r2, r3, d6
+; CHECK-NEXT:    vmov r5, r4, d7
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
+; CHECK-NEXT:    vmov.u8 r5, q1[9]
+; CHECK-NEXT:    vmov.u8 r4, q1[8]
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q0[9]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r2, r5, r2, r5
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
-; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[11]
-; CHECK-NEXT:    adc.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[10]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q0[11]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov r2, r3, d6
+; CHECK-NEXT:    vmov r5, r4, d7
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
+; CHECK-NEXT:    vmov.u8 r5, q1[11]
+; CHECK-NEXT:    vmov.u8 r4, q1[10]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r2, r5, r2, r5
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[13]
-; CHECK-NEXT:    adc.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[12]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q0[13]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov r2, r3, d6
+; CHECK-NEXT:    vmov r5, r4, d7
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
+; CHECK-NEXT:    vmov.u8 r5, q1[13]
+; CHECK-NEXT:    vmov.u8 r4, q1[12]
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r2, r5, r2, r5
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
-; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[15]
-; CHECK-NEXT:    adcs r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[14]
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q0[15]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
+; CHECK-NEXT:    vmov r2, r3, d6
+; CHECK-NEXT:    vmov r5, r4, d7
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
+; CHECK-NEXT:    vmov.u8 r5, q1[15]
+; CHECK-NEXT:    vmov.u8 r4, q1[14]
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q0[15]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    umlal r3, r2, r5, r6
-; CHECK-NEXT:    vmov r6, s6
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    umlal r3, r2, r5, r6
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r2, r5, r2, r5
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -1653,48 +1680,45 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov.s8 r2, q1[1]
 ; CHECK-NEXT:    vmov.s8 r3, q0[1]
-; CHECK-NEXT:    smull r12, r3, r3, r2
-; CHECK-NEXT:    vmov.s8 lr, q1[0]
+; CHECK-NEXT:    smull lr, r12, r3, r2
+; CHECK-NEXT:    vmov.s8 r3, q1[0]
 ; CHECK-NEXT:    vmov.s8 r2, q0[0]
 ; CHECK-NEXT:    vmov.s8 r4, q1[2]
 ; CHECK-NEXT:    vmov.s8 r5, q0[2]
-; CHECK-NEXT:    smull r2, lr, r2, lr
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r12
+; CHECK-NEXT:    smull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, lr
 ; CHECK-NEXT:    smull r4, r5, r5, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], lr, r3
-; CHECK-NEXT:    vmov lr, s10
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r12, s9
-; CHECK-NEXT:    adds.w lr, lr, r2
-; CHECK-NEXT:    vmov.s8 r2, q1[3]
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r12
+; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov r3, r2, d4
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov.s8 r3, q0[3]
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov.s8 r2, q1[3]
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
 ; CHECK-NEXT:    vmov q2[3], q2[1], r5, r3
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r5, lr, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r5, r4
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
+; CHECK-NEXT:    vmov.s8 r5, q1[5]
 ; CHECK-NEXT:    vmov.s8 r4, q0[5]
-; CHECK-NEXT:    adc.w r12, r2, r3
-; CHECK-NEXT:    vmov.s8 r3, q1[5]
 ; CHECK-NEXT:    vmov.s8 r2, q1[4]
-; CHECK-NEXT:    vmov.s8 r5, q0[4]
-; CHECK-NEXT:    smull r3, r4, r4, r3
-; CHECK-NEXT:    smull r2, r5, r5, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r5, r4
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov.s8 r3, q0[4]
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
 ; CHECK-NEXT:    vmov.s8 r5, q1[7]
-; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.s8 r4, q0[7]
 ; CHECK-NEXT:    vmov.s8 r2, q1[6]
 ; CHECK-NEXT:    vmov.s8 r3, q0[6]
@@ -1702,14 +1726,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
 ; CHECK-NEXT:    vmov.s8 r5, q1[9]
-; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.s8 r4, q0[9]
 ; CHECK-NEXT:    vmov.s8 r2, q1[8]
 ; CHECK-NEXT:    vmov.s8 r3, q0[8]
@@ -1717,14 +1740,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
 ; CHECK-NEXT:    vmov.s8 r5, q1[11]
-; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.s8 r4, q0[11]
 ; CHECK-NEXT:    vmov.s8 r2, q1[10]
 ; CHECK-NEXT:    vmov.s8 r3, q0[10]
@@ -1732,14 +1754,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
 ; CHECK-NEXT:    vmov.s8 r5, q1[13]
-; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.s8 r4, q0[13]
 ; CHECK-NEXT:    vmov.s8 r2, q1[12]
 ; CHECK-NEXT:    vmov.s8 r3, q0[12]
@@ -1747,21 +1768,28 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
 ; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    vmov.s8 r5, q1[14]
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov.s8 r4, q0[14]
-; CHECK-NEXT:    smlal r3, r2, r4, r5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    adc.w lr, r3, r4
 ; CHECK-NEXT:    vmov.s8 r5, q1[15]
 ; CHECK-NEXT:    vmov.s8 r4, q0[15]
-; CHECK-NEXT:    smlal r3, r2, r4, r5
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov.s8 r2, q1[14]
+; CHECK-NEXT:    vmov.s8 r3, q0[14]
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds r2, r2, r5
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -1805,18 +1833,24 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i6
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r12, r3, r2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    sxtb.w lr, r3
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smlal r2, r12, r3, lr
+; CHECK-NEXT:    smull r12, r3, r3, r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    sxtb.w lr, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    smull r2, lr, r2, lr
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], lr, r3
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r2, lr, d0
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
 ; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
@@ -1830,27 +1864,27 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
 ; CHECK-LABEL: add_v2i64_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    vmov r6, s7
-; CHECK-NEXT:    umull lr, r12, r3, r2
-; CHECK-NEXT:    mla r3, r3, r4, r12
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    mla r12, r4, r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    umull r2, r5, r4, r3
-; CHECK-NEXT:    mla r4, r4, r6, r5
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds.w r2, r2, lr
-; CHECK-NEXT:    mla r3, r5, r3, r4
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    vmov r2, r12, d3
+; CHECK-NEXT:    vmov r3, lr, d1
+; CHECK-NEXT:    vmov r6, r9, d2
+; CHECK-NEXT:    vmov r5, r11, d0
+; CHECK-NEXT:    umull r10, r8, r3, r2
+; CHECK-NEXT:    umull r4, r7, r5, r6
+; CHECK-NEXT:    mla r3, r3, r12, r8
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r10
+; CHECK-NEXT:    mla r2, lr, r2, r3
+; CHECK-NEXT:    mla r3, r5, r9, r7
+; CHECK-NEXT:    mla r3, r11, r6, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r7, r6, d0
+; CHECK-NEXT:    adds r3, r3, r7
+; CHECK-NEXT:    adcs r2, r6
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %m = mul <2 x i64> %x, %y
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 1222a2d3cb85e..ef0ec808e1b85 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -64,12 +64,10 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -98,12 +96,10 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -348,12 +344,10 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -395,12 +389,10 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -836,18 +828,16 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
 ; CHECK-NEXT:    vand q0, q0, q7
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w lr, r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q4[2]
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r1, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w lr, r3, r1
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r0, r0, #0
+; CHECK-NEXT:    vmov.u8 r1, q4[2]
 ; CHECK-NEXT:    vmov q7[2], q7[0], r0, r3
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov q7[3], q7[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.u8 r3, q1[2]
@@ -866,17 +856,15 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    vand q0, q0, q7
 ; CHECK-NEXT:    vmov q7, q4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q4[4]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds.w r0, r0, lr
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u16 r2, q6[6]
 ; CHECK-NEXT:    vmov.u16 r3, q6[4]
+; CHECK-NEXT:    vmov.u8 r1, q4[4]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[7]
 ; CHECK-NEXT:    vmov.u16 r3, q6[5]
@@ -905,23 +893,21 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r1
 ; CHECK-NEXT:    vand q0, q0, q6
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r3, lr, r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r3, r4
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r3, r4, d1
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r4, q4[6]
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
+; CHECK-NEXT:    adcs r1, r4
 ; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[7]
 ; CHECK-NEXT:    vmov.u8 r3, q1[6]
+; CHECK-NEXT:    vmov.u8 r4, q4[6]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q4[7]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
@@ -937,14 +923,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
 ; CHECK-NEXT:    vand q0, q0, q6
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r0, r2, d0
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    adds.w r12, r2, r0
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[8]
 ; CHECK-NEXT:    vmov.16 q6[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[9]
@@ -995,23 +979,21 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
 ; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r3, lr, r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r3, r4
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r3, r4, d1
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r4, q7[10]
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
+; CHECK-NEXT:    adcs r1, r4
 ; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[11]
 ; CHECK-NEXT:    vmov.u8 r3, q1[10]
+; CHECK-NEXT:    vmov.u8 r4, q7[10]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q7[11]
 ; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
@@ -1026,13 +1008,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
 ; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r2, r0
+; CHECK-NEXT:    vmov r0, r2, d0
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[6]
 ; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q3[4]
@@ -1065,23 +1045,21 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
 ; CHECK-NEXT:    vand q0, q0, q3
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r3, lr, r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r3, r4
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r3, r4, d1
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r4, q7[14]
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    adcs r1, r4
 ; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[15]
 ; CHECK-NEXT:    vmov.u8 r3, q1[14]
+; CHECK-NEXT:    vmov.u8 r4, q7[14]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q7[15]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r4, r3
@@ -1096,12 +1074,10 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
 ; CHECK-NEXT:    vand q0, q0, q3
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r0, r2, d0
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d1
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    add sp, #40
@@ -1170,11 +1146,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
 ; CHECK-NEXT:    vmov q7[3], q7[1], r3, r12
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    vmov r1, s24
-; CHECK-NEXT:    vmov r12, s27
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    adds.w lr, r1, r3
+; CHECK-NEXT:    vmov r1, r12, d13
+; CHECK-NEXT:    vmov r3, r2, d12
+; CHECK-NEXT:    adds.w lr, r3, r1
 ; CHECK-NEXT:    ubfx r3, r0, #12, #1
 ; CHECK-NEXT:    ubfx r0, r0, #8, #1
 ; CHECK-NEXT:    rsb.w r3, r3, #0
@@ -1191,21 +1165,19 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q7[2], q7[0], r1, r0
 ; CHECK-NEXT:    vmov q7[3], q7[1], r2, r3
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r1, s24
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.s8 r1, q1[4]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    vmov r0, r1, d12
+; CHECK-NEXT:    vmov r2, r3, d13
+; CHECK-NEXT:    adds.w r0, r0, lr
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u16 r2, q5[6]
 ; CHECK-NEXT:    vmov.u16 r3, q5[4]
-; CHECK-NEXT:    smull r1, r4, r4, r1
+; CHECK-NEXT:    vmov.s8 r1, q1[4]
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[7]
 ; CHECK-NEXT:    vmov.u16 r3, q5[5]
+; CHECK-NEXT:    smull r1, r4, r4, r1
 ; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r2, p0
@@ -1221,38 +1193,32 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q6[2], q6[0], r1, r0
 ; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r0, r0, lr
-; CHECK-NEXT:    adds r1, r1, r4
-; CHECK-NEXT:    vmov.s8 r4, q1[6]
-; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r3, r4, d11
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.s8 r0, q0[6]
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
-; CHECK-NEXT:    smull r0, r4, r0, r4
+; CHECK-NEXT:    adcs r1, r4
 ; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[7]
 ; CHECK-NEXT:    vmov.s8 r3, q0[7]
+; CHECK-NEXT:    vmov.s8 r4, q1[6]
+; CHECK-NEXT:    vmov.s8 r0, q0[6]
 ; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r4, r0, r4
 ; CHECK-NEXT:    vmov q6[2], q6[0], r0, r2
 ; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.s8 r0, q1[8]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    vmov r0, r2, d10
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d11
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -1269,15 +1235,17 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.s8 r1, q0[8]
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
-; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov.s8 r0, q1[8]
 ; CHECK-NEXT:    vpsel q2, q3, q2
+; CHECK-NEXT:    vmov.s8 r1, q0[8]
 ; CHECK-NEXT:    vmov.u16 r2, q2[2]
 ; CHECK-NEXT:    vmov.u16 r3, q2[0]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
 ; CHECK-NEXT:    vmov.u16 r3, q2[1]
+; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
@@ -1293,47 +1261,43 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
 ; CHECK-NEXT:    vmov q4[3], q4[1], r1, r4
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r0, r0, lr
-; CHECK-NEXT:    adds r1, r1, r4
-; CHECK-NEXT:    vmov.s8 r4, q1[10]
-; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov r3, r4, d7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.s8 r0, q0[10]
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    smull r0, r4, r0, r4
+; CHECK-NEXT:    adcs r1, r4
 ; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[11]
 ; CHECK-NEXT:    vmov.s8 r3, q0[11]
+; CHECK-NEXT:    vmov.s8 r4, q1[10]
+; CHECK-NEXT:    vmov.s8 r0, q0[10]
 ; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r4, r0, r4
 ; CHECK-NEXT:    vmov q4[2], q4[0], r0, r2
 ; CHECK-NEXT:    vmov q4[3], q4[1], r4, r3
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.s8 r0, q1[12]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    vmov r0, r2, d6
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d7
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vmov.u16 r3, q2[4]
-; CHECK-NEXT:    vmov.s8 r1, q0[12]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[7]
 ; CHECK-NEXT:    vmov.u16 r3, q2[5]
-; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov.s8 r0, q1[12]
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.s8 r1, q0[12]
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
+; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r4, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
@@ -1347,37 +1311,33 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
 ; CHECK-NEXT:    vmov q3[3], q3[1], r1, r4
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r0, r0, lr
-; CHECK-NEXT:    adds r1, r1, r4
-; CHECK-NEXT:    vmov.s8 r4, q1[14]
-; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov r0, r1, d4
+; CHECK-NEXT:    vmov r3, r4, d5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adc.w r1, r1, lr
+; CHECK-NEXT:    adds.w r12, r0, r3
 ; CHECK-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.s8 r0, q0[14]
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    smull r0, r4, r0, r4
+; CHECK-NEXT:    adcs r1, r4
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[15]
 ; CHECK-NEXT:    vmov.s8 r3, q0[15]
+; CHECK-NEXT:    vmov.s8 r4, q1[14]
+; CHECK-NEXT:    vmov.s8 r0, q0[14]
 ; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r4, r0, r4
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r2, r3
+; CHECK-NEXT:    vmov r0, r2, d0
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -1498,12 +1458,10 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1545,12 +1503,10 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1565,29 +1521,23 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
 ; CHECK-LABEL: add_v2i64_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    umull lr, r12, r1, r0
-; CHECK-NEXT:    umull r4, r5, r2, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, lr
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    mla r1, r1, r4, r12
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    mla r0, r4, r0, r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    mla r1, r2, r1, r5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    mla r1, r2, r3, r1
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    vmov r0, r12, d3
+; CHECK-NEXT:    vmov r2, lr, d1
+; CHECK-NEXT:    vmov r4, r9, d2
+; CHECK-NEXT:    vmov r6, r7, d0
+; CHECK-NEXT:    umull r1, r8, r2, r0
+; CHECK-NEXT:    umull r3, r5, r6, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    mla r1, r2, r12, r8
+; CHECK-NEXT:    mla r0, lr, r0, r1
+; CHECK-NEXT:    mla r1, r6, r9, r5
+; CHECK-NEXT:    mla r1, r7, r4, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov r0, r1, d5
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
@@ -1595,16 +1545,14 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64
 ; CHECK-NEXT:    cset r1, eq
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y
@@ -1681,14 +1629,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -1720,14 +1666,12 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -1947,14 +1891,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -1999,14 +1941,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -2311,18 +2251,16 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r12
 ; CHECK-NEXT:    vand q0, q0, q7
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    adds r6, r2, r3
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    adds r6, r3, r2
 ; CHECK-NEXT:    ubfx r2, lr, #12, #1
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r3, q4[2]
 ; CHECK-NEXT:    adc.w r12, r12, r4
 ; CHECK-NEXT:    ubfx r4, lr, #8, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    vmov q7[2], q7[0], r4, r2
+; CHECK-NEXT:    vmov.u8 r3, q4[2]
 ; CHECK-NEXT:    vmov q7[3], q7[1], r4, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[3]
 ; CHECK-NEXT:    vmov.u8 r4, q1[2]
@@ -2342,16 +2280,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.u8 r4, q4[4]
 ; CHECK-NEXT:    vand q0, q0, q7
 ; CHECK-NEXT:    vmov q7, q4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    vmov r6, r5, d1
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adds.w r12, r2, r6
 ; CHECK-NEXT:    vmov.u16 r2, q6[6]
 ; CHECK-NEXT:    vmov.u16 r6, q6[4]
+; CHECK-NEXT:    adc.w lr, r3, r5
 ; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[7]
 ; CHECK-NEXT:    vmov.u16 r6, q6[5]
@@ -2380,20 +2316,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vand q0, q0, q6
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w r6, r12, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r5, lr, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r6, lr, r4
+; CHECK-NEXT:    vmov r5, r4, d1
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q4[6]
+; CHECK-NEXT:    adc.w r12, r6, r4
 ; CHECK-NEXT:    ubfx r6, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r6, r6, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r12, r5, r4
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q6[2], q6[0], r2, r6
-; CHECK-NEXT:    vmov.u8 r5, q4[6]
 ; CHECK-NEXT:    vmov q6[3], q6[1], r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q1[7]
 ; CHECK-NEXT:    vmov.u8 r6, q1[6]
@@ -2413,15 +2347,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
 ; CHECK-NEXT:    vmov.u8 r4, q7[8]
 ; CHECK-NEXT:    vand q0, q0, q6
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    vmov r2, r6, d0
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r6
+; CHECK-NEXT:    vmov r6, r5, d1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    adds.w r12, r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q5[8]
 ; CHECK-NEXT:    vmov.16 q6[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[9]
@@ -2438,6 +2369,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q6[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[15]
 ; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    adc.w lr, r3, r5
 ; CHECK-NEXT:    vcmp.i16 ne, q6, zr
 ; CHECK-NEXT:    vpsel q3, q3, q0
 ; CHECK-NEXT:    vmov.u16 r2, q3[2]
@@ -2470,20 +2402,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w r6, r12, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r5, lr, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r6, lr, r4
+; CHECK-NEXT:    vmov r5, r4, d1
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q7[10]
+; CHECK-NEXT:    adc.w r12, r6, r4
 ; CHECK-NEXT:    ubfx r6, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r6, r6, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r12, r5, r4
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r6
-; CHECK-NEXT:    vmov.u8 r5, q7[10]
 ; CHECK-NEXT:    vmov q4[3], q4[1], r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q1[11]
 ; CHECK-NEXT:    vmov.u8 r6, q1[10]
@@ -2502,16 +2432,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
 ; CHECK-NEXT:    vmov.u8 r4, q7[12]
 ; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    vmov r2, r6, d0
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r6
+; CHECK-NEXT:    vmov r6, r5, d1
+; CHECK-NEXT:    adds.w r12, r2, r6
 ; CHECK-NEXT:    vmov.u16 r2, q3[6]
 ; CHECK-NEXT:    vmov.u16 r6, q3[4]
+; CHECK-NEXT:    adc.w lr, r3, r5
 ; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[7]
 ; CHECK-NEXT:    vmov.u16 r6, q3[5]
@@ -2540,20 +2468,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vand q0, q0, q3
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w r6, r12, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r5, lr, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r3, r4, d0
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r6, lr, r4
+; CHECK-NEXT:    vmov r5, r4, d1
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q7[14]
+; CHECK-NEXT:    adc.w r12, r6, r4
 ; CHECK-NEXT:    ubfx r6, r2, #12, #1
 ; CHECK-NEXT:    ubfx r2, r2, #8, #1
-; CHECK-NEXT:    rsb.w r6, r6, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r12, r5, r4
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
-; CHECK-NEXT:    vmov.u8 r5, q7[14]
 ; CHECK-NEXT:    vmov q3[3], q3[1], r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q1[15]
 ; CHECK-NEXT:    vmov.u8 r6, q1[14]
@@ -2571,16 +2497,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
 ; CHECK-NEXT:    vand q0, q0, q3
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    adcs r2, r6
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r6, d0
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r6
+; CHECK-NEXT:    vmov r6, r5, d1
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
@@ -2648,11 +2572,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
 ; CHECK-NEXT:    vmov q7[3], q7[1], r4, lr
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r4, s26
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov lr, s27
-; CHECK-NEXT:    vmov r3, s25
-; CHECK-NEXT:    adds r6, r2, r4
+; CHECK-NEXT:    vmov r2, lr, d13
+; CHECK-NEXT:    vmov r4, r3, d12
+; CHECK-NEXT:    adds r6, r4, r2
 ; CHECK-NEXT:    ubfx r4, r12, #12, #1
 ; CHECK-NEXT:    ubfx r2, r12, #8, #1
 ; CHECK-NEXT:    rsb.w r4, r4, #0
@@ -2668,24 +2590,22 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
 ; CHECK-NEXT:    vmov q7[3], q7[1], r5, r4
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    vmov r5, s27
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    adc.w r6, lr, r2
-; CHECK-NEXT:    vmov r2, s26
-; CHECK-NEXT:    adds.w r12, r3, r2
-; CHECK-NEXT:    vmov.s8 r2, q1[4]
-; CHECK-NEXT:    adc.w lr, r6, r5
+; CHECK-NEXT:    vmov r2, r3, d12
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    vmov r6, r5, d13
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r6
 ; CHECK-NEXT:    vmov.u16 r6, q5[6]
+; CHECK-NEXT:    adc.w lr, r3, r5
 ; CHECK-NEXT:    vmov.u16 r5, q5[4]
-; CHECK-NEXT:    vmov.s8 r3, q0[4]
 ; CHECK-NEXT:    vmov q6[2], q6[0], r5, r6
 ; CHECK-NEXT:    vmov.u16 r6, q5[7]
 ; CHECK-NEXT:    vmov.u16 r5, q5[5]
-; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.s8 r2, q1[4]
 ; CHECK-NEXT:    vmov q6[3], q6[1], r5, r6
+; CHECK-NEXT:    vmov.s8 r3, q0[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
+; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmrs r6, p0
 ; CHECK-NEXT:    and r4, r6, #1
 ; CHECK-NEXT:    ubfx r5, r6, #4, #1
@@ -2699,38 +2619,32 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q6[2], q6[0], r2, r5
 ; CHECK-NEXT:    vmov q6[3], q6[1], r3, r4
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov r5, s23
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r2, r2, lr
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov.s8 r4, q1[6]
-; CHECK-NEXT:    adc.w r12, r2, r5
+; CHECK-NEXT:    vmov r2, r3, d10
+; CHECK-NEXT:    vmov r5, r4, d11
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
 ; CHECK-NEXT:    ubfx r5, r6, #12, #1
 ; CHECK-NEXT:    ubfx r6, r6, #8, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    vmov.s8 r2, q0[6]
+; CHECK-NEXT:    rsb.w r5, r5, #0
+; CHECK-NEXT:    rsb.w r6, r6, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r6, r5
-; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov q5[3], q5[1], r6, r5
 ; CHECK-NEXT:    vmov.s8 r6, q1[7]
 ; CHECK-NEXT:    vmov.s8 r5, q0[7]
+; CHECK-NEXT:    vmov.s8 r4, q1[6]
+; CHECK-NEXT:    vmov.s8 r2, q0[6]
 ; CHECK-NEXT:    smull r6, r5, r5, r6
+; CHECK-NEXT:    smull r2, r4, r2, r4
 ; CHECK-NEXT:    vmov q6[2], q6[0], r2, r6
 ; CHECK-NEXT:    vmov q6[3], q6[1], r4, r5
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r6, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r5, s23
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    adc.w r6, r12, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r3, r2
-; CHECK-NEXT:    vmov.s8 r2, q1[8]
-; CHECK-NEXT:    adc.w lr, r6, r5
+; CHECK-NEXT:    vmov r2, r6, d10
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r6
+; CHECK-NEXT:    vmov r6, r5, d11
+; CHECK-NEXT:    adds.w r12, r2, r6
 ; CHECK-NEXT:    vmov.u8 r6, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r6
 ; CHECK-NEXT:    vmov.u8 r6, q4[9]
@@ -2747,15 +2661,17 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q5[6], r6
 ; CHECK-NEXT:    vmov.u8 r6, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r6
-; CHECK-NEXT:    vmov.s8 r3, q0[8]
+; CHECK-NEXT:    adc.w lr, r3, r5
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
-; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.s8 r2, q1[8]
 ; CHECK-NEXT:    vpsel q2, q3, q2
+; CHECK-NEXT:    vmov.s8 r3, q0[8]
 ; CHECK-NEXT:    vmov.u16 r6, q2[2]
 ; CHECK-NEXT:    vmov.u16 r5, q2[0]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
 ; CHECK-NEXT:    vmov.u16 r6, q2[3]
 ; CHECK-NEXT:    vmov.u16 r5, q2[1]
+; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q3[3], q3[1], r5, r6
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r6, p0
@@ -2771,47 +2687,43 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r5
 ; CHECK-NEXT:    vmov q4[3], q4[1], r3, r4
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r2, r2, lr
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov.s8 r4, q1[10]
-; CHECK-NEXT:    adc.w r12, r2, r5
+; CHECK-NEXT:    vmov r2, r3, d6
+; CHECK-NEXT:    vmov r5, r4, d7
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
 ; CHECK-NEXT:    ubfx r5, r6, #12, #1
 ; CHECK-NEXT:    ubfx r6, r6, #8, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    vmov.s8 r2, q0[10]
+; CHECK-NEXT:    rsb.w r5, r5, #0
+; CHECK-NEXT:    rsb.w r6, r6, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r6, r5
-; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov q3[3], q3[1], r6, r5
 ; CHECK-NEXT:    vmov.s8 r6, q1[11]
 ; CHECK-NEXT:    vmov.s8 r5, q0[11]
+; CHECK-NEXT:    vmov.s8 r4, q1[10]
+; CHECK-NEXT:    vmov.s8 r2, q0[10]
 ; CHECK-NEXT:    smull r6, r5, r5, r6
+; CHECK-NEXT:    smull r2, r4, r2, r4
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r6
 ; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    adc.w r6, r12, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adds.w r12, r3, r2
-; CHECK-NEXT:    vmov.s8 r2, q1[12]
-; CHECK-NEXT:    adc.w lr, r6, r5
+; CHECK-NEXT:    vmov r2, r6, d6
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r6
+; CHECK-NEXT:    vmov r6, r5, d7
+; CHECK-NEXT:    adds.w r12, r2, r6
 ; CHECK-NEXT:    vmov.u16 r6, q2[6]
+; CHECK-NEXT:    adc.w lr, r3, r5
 ; CHECK-NEXT:    vmov.u16 r5, q2[4]
-; CHECK-NEXT:    vmov.s8 r3, q0[12]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
 ; CHECK-NEXT:    vmov.u16 r6, q2[7]
 ; CHECK-NEXT:    vmov.u16 r5, q2[5]
-; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.s8 r2, q1[12]
 ; CHECK-NEXT:    vmov q3[3], q3[1], r5, r6
+; CHECK-NEXT:    vmov.s8 r3, q0[12]
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
+; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmrs r6, p0
 ; CHECK-NEXT:    and r4, r6, #1
 ; CHECK-NEXT:    ubfx r5, r6, #4, #1
@@ -2825,39 +2737,35 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r5
 ; CHECK-NEXT:    vmov q3[3], q3[1], r3, r4
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r2, r2, lr
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov.s8 r4, q1[14]
-; CHECK-NEXT:    adc.w r12, r2, r5
+; CHECK-NEXT:    vmov r2, r3, d4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r5
 ; CHECK-NEXT:    ubfx r5, r6, #12, #1
 ; CHECK-NEXT:    ubfx r6, r6, #8, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    vmov.s8 r2, q0[14]
+; CHECK-NEXT:    rsb.w r5, r5, #0
+; CHECK-NEXT:    rsb.w r6, r6, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r6, r5
-; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov q2[3], q2[1], r6, r5
 ; CHECK-NEXT:    vmov.s8 r6, q1[15]
 ; CHECK-NEXT:    vmov.s8 r5, q0[15]
+; CHECK-NEXT:    vmov.s8 r4, q1[14]
+; CHECK-NEXT:    vmov.s8 r2, q0[14]
 ; CHECK-NEXT:    smull r6, r5, r5, r6
+; CHECK-NEXT:    smull r2, r4, r2, r4
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r6
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    adcs r2, r6
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, r6, d0
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r6
+; CHECK-NEXT:    vmov r6, r5, d1
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
@@ -2901,14 +2809,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -2953,14 +2859,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -2976,29 +2880,23 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
 ; CHECK-LABEL: add_v2i64_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    umull r12, lr, r3, r2
-; CHECK-NEXT:    umull r6, r7, r5, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r12
-; CHECK-NEXT:    vmov r6, s7
-; CHECK-NEXT:    mla r3, r3, r6, lr
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    mla r2, r6, r2, r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    mla r3, r5, r3, r7
-; CHECK-NEXT:    vmov r7, s1
-; CHECK-NEXT:    mla r3, r7, r4, r3
-; CHECK-NEXT:    vmov r7, s8
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    vmov r2, r12, d3
+; CHECK-NEXT:    vmov r3, lr, d1
+; CHECK-NEXT:    vmov r6, r9, d2
+; CHECK-NEXT:    vmov r5, r11, d0
+; CHECK-NEXT:    umull r10, r8, r3, r2
+; CHECK-NEXT:    umull r4, r7, r5, r6
+; CHECK-NEXT:    mla r3, r3, r12, r8
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r10
+; CHECK-NEXT:    mla r2, lr, r2, r3
+; CHECK-NEXT:    mla r3, r5, r9, r7
+; CHECK-NEXT:    mla r3, r11, r6, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov r2, r3, d5
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    vmov r3, r7, d4
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r2, ne
@@ -3006,18 +2904,16 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x
 ; CHECK-NEXT:    cset r3, eq
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r7, s2
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r7, r7, r6
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    adds r0, r0, r7
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r7, r6, d0
+; CHECK-NEXT:    adds r2, r2, r7
+; CHECK-NEXT:    adcs r3, r6
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
index 2129d8efc3f89..97931d88d1a14 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
@@ -16,11 +16,9 @@ entry:
 define arm_aapcs_vfpcc i32 @mul_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: mul_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -33,11 +31,9 @@ define arm_aapcs_vfpcc i32 @mul_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: mul_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -49,11 +45,9 @@ entry:
 define arm_aapcs_vfpcc i16 @mul_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: mul_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -170,14 +164,14 @@ entry:
 define arm_aapcs_vfpcc i64 @mul_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: mul_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    umull r0, r12, r2, r1
-; CHECK-NEXT:    mla r2, r2, r3, r12
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    mla r1, r3, r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r1, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    umull r0, r2, r3, r1
+; CHECK-NEXT:    mla r2, r3, r12, r2
+; CHECK-NEXT:    mla r1, lr, r1, r2
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x)
   ret i64 %z
@@ -186,26 +180,22 @@ entry:
 define arm_aapcs_vfpcc i64 @mul_v4i64(<4 x i64> %x) {
 ; CHECK-LABEL: mul_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    vmov lr, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r6, s6
-; CHECK-NEXT:    vmov r5, s7
-; CHECK-NEXT:    umull r3, r12, r2, lr
-; CHECK-NEXT:    umull r4, r8, r3, r1
-; CHECK-NEXT:    umull r0, r7, r4, r6
-; CHECK-NEXT:    mla r4, r4, r5, r7
-; CHECK-NEXT:    vmov r5, s5
-; CHECK-NEXT:    vmov r7, s1
-; CHECK-NEXT:    mla r3, r3, r5, r8
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    mla r2, r2, r5, r12
-; CHECK-NEXT:    mla r2, r7, lr, r2
-; CHECK-NEXT:    mla r1, r2, r1, r3
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    vmov r1, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    vmov r5, r9, d2
+; CHECK-NEXT:    vmov r6, r11, d3
+; CHECK-NEXT:    umull r2, r8, r3, r1
+; CHECK-NEXT:    mla r3, r3, r12, r8
+; CHECK-NEXT:    umull r7, r10, r2, r5
+; CHECK-NEXT:    mla r1, lr, r1, r3
+; CHECK-NEXT:    mla r2, r2, r9, r10
+; CHECK-NEXT:    umull r0, r4, r7, r6
+; CHECK-NEXT:    mla r1, r1, r5, r2
+; CHECK-NEXT:    mla r4, r7, r11, r4
 ; CHECK-NEXT:    mla r1, r1, r6, r4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x)
   ret i64 %z
@@ -228,11 +218,9 @@ entry:
 define arm_aapcs_vfpcc i32 @mul_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: mul_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
@@ -247,11 +235,9 @@ define arm_aapcs_vfpcc i32 @mul_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: mul_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
@@ -265,11 +251,9 @@ entry:
 define arm_aapcs_vfpcc i16 @mul_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: mul_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
@@ -405,20 +389,18 @@ entry:
 define arm_aapcs_vfpcc i64 @mul_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-LABEL: mul_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    umull r12, lr, r3, r2
-; CHECK-NEXT:    mla r3, r3, r4, lr
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    mla r3, r4, r2, r3
-; CHECK-NEXT:    umull r2, r4, r0, r12
-; CHECK-NEXT:    mla r0, r0, r3, r4
-; CHECK-NEXT:    mla r1, r1, r12, r0
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    umull r4, r5, r3, r2
+; CHECK-NEXT:    mla r3, r3, r12, r5
+; CHECK-NEXT:    mla r3, lr, r2, r3
+; CHECK-NEXT:    umull r2, r5, r0, r4
+; CHECK-NEXT:    mla r0, r0, r3, r5
+; CHECK-NEXT:    mla r1, r1, r4, r0
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x)
   %r = mul i64 %y, %z
@@ -428,30 +410,36 @@ entry:
 define arm_aapcs_vfpcc i64 @mul_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-LABEL: mul_v4i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    vmov r12, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r7, s6
-; CHECK-NEXT:    vmov r6, s7
-; CHECK-NEXT:    umull r2, lr, r3, r12
-; CHECK-NEXT:    umull r5, r8, r2, r4
-; CHECK-NEXT:    umull r10, r9, r5, r7
-; CHECK-NEXT:    mla r5, r5, r6, r9
-; CHECK-NEXT:    vmov r6, s5
-; CHECK-NEXT:    mla r2, r2, r6, r8
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    mla r3, r3, r6, lr
-; CHECK-NEXT:    vmov r6, s1
-; CHECK-NEXT:    mla r3, r6, r12, r3
-; CHECK-NEXT:    mla r2, r3, r4, r2
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #12
+; CHECK-NEXT:    sub sp, #12
+; CHECK-NEXT:    mov lr, r0
+; CHECK-NEXT:    vmov r2, r0, d1
+; CHECK-NEXT:    vmov r6, r9, d2
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    vmov r7, r11, d3
+; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vmov r3, r0, d0
+; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
+; CHECK-NEXT:    umull r4, r8, r3, r2
+; CHECK-NEXT:    mla r3, r3, r1, r8
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT:    umull r5, r10, r4, r6
+; CHECK-NEXT:    mla r2, r1, r2, r3
+; CHECK-NEXT:    mla r4, r4, r9, r10
+; CHECK-NEXT:    umull r0, r12, r5, r7
+; CHECK-NEXT:    mla r2, r2, r6, r4
+; CHECK-NEXT:    mla r5, r5, r11, r12
 ; CHECK-NEXT:    mla r3, r2, r7, r5
-; CHECK-NEXT:    umull r2, r7, r0, r10
-; CHECK-NEXT:    mla r0, r0, r3, r7
-; CHECK-NEXT:    mla r1, r1, r10, r0
+; CHECK-NEXT:    umull r2, r7, lr, r0
+; CHECK-NEXT:    mla r1, lr, r3, r7
+; CHECK-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    mla r1, r3, r0, r1
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    add sp, #12
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x)
   %r = mul i64 %y, %z

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
index ff3b46aa95d00..6bd3ee578b89c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
@@ -66,32 +66,28 @@ entry:
 define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-LABEL: vld2_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
-; CHECK-NEXT:    vmov.f64 d2, d1
-; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r12, s7
-; CHECK-NEXT:    vmov lr, s3
-; CHECK-NEXT:    adds r6, r3, r2
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adc.w r12, r12, lr
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov r4, r7, d4
+; CHECK-NEXT:    vmov r2, r5, d0
+; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov r3, r6, d1
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r6, r6, r12
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adcs r7, r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r7, r6
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %l1 = load <4 x i64>, <4 x i64>* %src, align 4
   %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 0, i32 2>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index e21d3d798d359..8b139f8d97d2d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -319,32 +319,28 @@ entry:
 define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-LABEL: vld2_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d2, d1
-; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r12, s7
-; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov r0, r4, d4
+; CHECK-NEXT:    vmov r5, r6, d0
+; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov r3, r2, d1
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r0, r0, r5
+; CHECK-NEXT:    adcs r6, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %l1 = load <4 x i64>, <4 x i64>* %src, align 8
   %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
@@ -357,58 +353,50 @@ entry:
 define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vld2_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
-; CHECK-NEXT:    vmov.f64 d4, d1
-; CHECK-NEXT:    vmov.f32 s9, s3
-; CHECK-NEXT:    vmov.f32 s10, s22
-; CHECK-NEXT:    vmov.f32 s2, s20
-; CHECK-NEXT:    vmov.f32 s11, s23
-; CHECK-NEXT:    vmov.f32 s3, s21
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.f64 d6, d3
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmov.f32 s6, s16
-; CHECK-NEXT:    vmov.f32 s7, s17
-; CHECK-NEXT:    vmov.f32 s15, s19
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    adcs r0, r2
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r3
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
-; CHECK-NEXT:    adds r4, r4, r6
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, lr
-; CHECK-NEXT:    adcs r0, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r12
-; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s18, s14
+; CHECK-NEXT:    vmov.f32 s10, s12
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov.f32 s19, s15
+; CHECK-NEXT:    vmov.f32 s11, s13
+; CHECK-NEXT:    vmov r0, r7, d8
+; CHECK-NEXT:    vmov r5, r6, d4
+; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, r4, d9
+; CHECK-NEXT:    adds r0, r0, r5
+; CHECK-NEXT:    adc.w r8, r6, r7
+; CHECK-NEXT:    vmov r6, r5, d5
+; CHECK-NEXT:    vmov r2, r7, d0
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    adc.w r6, r5, r4
+; CHECK-NEXT:    vmov r5, r4, d2
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r8, r6
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    adds r2, r2, r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    adc.w r0, r7, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <8 x i64>, <8 x i64>* %src, align 8
   %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -576,8 +564,7 @@ define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vins.f16 s5, s8
 ; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r0, r2, d0
 ; CHECK-NEXT:    strd r0, r2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index 06dbfe8debbbc..423f796e97753 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -6,20 +6,21 @@
 define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
 ; CHECK-LABEL: vld3_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    ldrd r12, r3, [r0, #16]
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
 ; CHECK-NEXT:    vmov.f64 d2, d0
 ; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov r12, lr, d0
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    add.w r3, r12, lr
 ; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    add r0, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    add r2, r12
 ; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    strd r0, r2, [r1]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    strd r2, r0, [r1]
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %l1 = load <6 x i32>, <6 x i32>* %src, align 4
   %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
@@ -250,30 +251,30 @@ entry:
 define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK-LABEL: vld3_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.u16 r3, q0[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.u16 r3, q0[1]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r5, q0[6]
+; CHECK-NEXT:    vmov.u16 r6, q0[0]
+; CHECK-NEXT:    vmov r0, r3, d2
+; CHECK-NEXT:    vmov.u16 lr, q0[2]
+; CHECK-NEXT:    vmov r2, r4, d3
+; CHECK-NEXT:    vmov q1[2], q1[0], r6, r5
+; CHECK-NEXT:    vmov.u16 r5, q0[7]
+; CHECK-NEXT:    vmov.u16 r6, q0[1]
+; CHECK-NEXT:    vmov q2[2], q2[0], r6, r5
+; CHECK-NEXT:    vmov.u16 r5, q0[3]
+; CHECK-NEXT:    vmov.u16 r6, q0[4]
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r6, r2
+; CHECK-NEXT:    vmov.u16 r12, q0[5]
 ; CHECK-NEXT:    vadd.i32 q0, q1, q2
+; CHECK-NEXT:    vmov q1[2], q1[0], lr, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r12, r4
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %l1 = load <12 x i16>, <12 x i16>* %src, align 4
   %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -747,48 +748,37 @@ entry:
 define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-LABEL: vld3_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d6, d3
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s14, s16
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov.f32 s15, s17
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.f64 d0, d4
-; CHECK-NEXT:    vmov.f32 s1, s9
-; CHECK-NEXT:    vmov.f32 s2, s18
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov r12, s15
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, lr, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    adc.w r12, r2, r3
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s14, s4
+; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov r5, r8, d6
+; CHECK-NEXT:    vmov r6, r7, d0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    vmov lr, r12, d7
+; CHECK-NEXT:    vmov r2, r4, d5
+; CHECK-NEXT:    adds.w r0, r0, lr
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r3, r4
+; CHECK-NEXT:    vmov r3, r4, d4
+; CHECK-NEXT:    adds r6, r6, r5
+; CHECK-NEXT:    adc.w r7, r7, r8
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    adcs r7, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r7, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <6 x i64>, <6 x i64>* %src, align 4
   %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
@@ -805,86 +795,65 @@ define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #24
-; CHECK-NEXT:    sub sp, #24
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q5, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d0, d4
-; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s1, s9
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s2, s6
-; CHECK-NEXT:    vmov.f64 d8, d5
-; CHECK-NEXT:    vmov.f32 s17, s11
-; CHECK-NEXT:    vmov.f32 s18, s4
-; CHECK-NEXT:    vmov.f32 s19, s5
-; CHECK-NEXT:    vmov.f64 d12, d11
-; CHECK-NEXT:    vmov.f32 s3, s7
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s25, s23
-; CHECK-NEXT:    vmov.f32 s26, s4
-; CHECK-NEXT:    vmov.f32 s22, s14
-; CHECK-NEXT:    vmov.f32 s27, s5
-; CHECK-NEXT:    vmov.f32 s23, s15
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.f64 d14, d6
-; CHECK-NEXT:    vmov r12, s27
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    vmov.f32 s29, s13
-; CHECK-NEXT:    vmov.f32 s30, s6
-; CHECK-NEXT:    vmov.f32 s31, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vmov.f32 s11, s7
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov r6, s8
-; CHECK-NEXT:    vmov r7, s24
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #80]
+; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vmov.f32 s7, s13
+; CHECK-NEXT:    vmov.f32 s11, s15
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d10, d7
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r5, r4, d1
+; CHECK-NEXT:    vmov r3, r8, d5
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmov.f32 s22, s24
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmov.f32 s23, s25
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vmov r6, r7, d10
+; CHECK-NEXT:    vmov.f32 s19, s27
+; CHECK-NEXT:    adds.w r0, r5, lr
+; CHECK-NEXT:    adc.w r5, r4, r12
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    adc.w r3, r2, r12
-; CHECK-NEXT:    vmov r2, s31
-; CHECK-NEXT:    adds.w lr, lr, r0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    adc.w r8, r2, r3
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s21
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r0
-; CHECK-NEXT:    vmov r0, s29
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s25
+; CHECK-NEXT:    vmov r4, r2, d6
+; CHECK-NEXT:    adc.w r12, r5, r8
+; CHECK-NEXT:    vmov r5, r0, d8
+; CHECK-NEXT:    adds r6, r6, r4
+; CHECK-NEXT:    adcs r2, r7
+; CHECK-NEXT:    adds r6, r6, r5
+; CHECK-NEXT:    adc.w r8, r2, r0
+; CHECK-NEXT:    vmov r7, r4, d11
+; CHECK-NEXT:    vmov r2, r5, d7
+; CHECK-NEXT:    vmov r3, r0, d0
 ; CHECK-NEXT:    adds r2, r2, r7
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r8
+; CHECK-NEXT:    adc.w r7, r5, r4
+; CHECK-NEXT:    vmov r5, r4, d9
+; CHECK-NEXT:    adds r2, r2, r5
+; CHECK-NEXT:    adcs r7, r4
+; CHECK-NEXT:    vmov r5, r4, d2
+; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r8, r7
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    adcs r4, r6
-; CHECK-NEXT:    vmov r6, s28
-; CHECK-NEXT:    adds r2, r2, r6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    adds r3, r3, r5
 ; CHECK-NEXT:    adcs r0, r4
+; CHECK-NEXT:    vmov r4, r5, d4
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, lr
+; CHECK-NEXT:    adcs r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #24
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <12 x i64>, <12 x i64>* %src, align 4
@@ -1133,8 +1102,8 @@ entry:
 define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld3_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
 ; CHECK-NEXT:    ldrd r2, r3, [r0, #16]
 ; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.32 q2[1], r3
@@ -1144,11 +1113,11 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmovx.f16 s8, s8
 ; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vmovx.f16 s16, s5
 ; CHECK-NEXT:    vins.f16 s12, s6
-; CHECK-NEXT:    vmovx.f16 s18, s5
+; CHECK-NEXT:    vins.f16 s4, s16
+; CHECK-NEXT:    vmovx.f16 s16, s6
 ; CHECK-NEXT:    vins.f16 s5, s16
-; CHECK-NEXT:    vins.f16 s4, s18
 ; CHECK-NEXT:    vmovx.f16 s13, s7
 ; CHECK-NEXT:    vins.f16 s7, s8
 ; CHECK-NEXT:    vmov.f32 s0, s5
@@ -1156,10 +1125,9 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vadd.f16 q1, q1, q3
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r0, r2, d0
 ; CHECK-NEXT:    strd r0, r2, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <12 x half>, <12 x half>* %src, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
index 8163e550b6f80..06c10e0b7bb1a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
@@ -102,62 +102,51 @@ entry:
 define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-LABEL: vld4_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q0, [r0], #64
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #-48]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #-16]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    vmov.f64 d2, d1
-; CHECK-NEXT:    vmov.f64 d8, d7
-; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.f32 s18, s22
-; CHECK-NEXT:    vmov.f32 s14, s20
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmov.f32 s15, s21
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s6, s10
 ; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s9
 ; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov r12, s19
-; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    vmov r7, s0
-; CHECK-NEXT:    adds r6, r3, r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adc.w r12, r12, lr
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    adds.w lr, r5, r6
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r6, s17
-; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    adcs r6, r5
-; CHECK-NEXT:    vmov r5, s5
-; CHECK-NEXT:    adds r3, r3, r7
-; CHECK-NEXT:    adcs r4, r5
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r4, r6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vldrw.u32 q2, [r0], #64
+; CHECK-NEXT:    vmov.f64 d6, d5
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmov.f32 s10, s16
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r2, r7, d1
+; CHECK-NEXT:    vmov r4, r8, d7
+; CHECK-NEXT:    vmov r3, r6, d5
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    adc.w r7, r7, r12
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    adc.w r6, r6, r8
+; CHECK-NEXT:    adds.w r12, r3, r2
+; CHECK-NEXT:    vmov r3, r2, d0
+; CHECK-NEXT:    adc.w lr, r6, r7
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r6, r4, d6
+; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    vmov r5, r7, d4
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    adcs r4, r7
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, lr
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <8 x i64>, <8 x i64>* %src, align 4
   %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index aa5d933562bc1..0d574f954e3cd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -644,59 +644,51 @@ entry:
 define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-LABEL: vld4_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d8, d7
-; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.f32 s18, s22
-; CHECK-NEXT:    vmov.f32 s14, s20
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmov.f32 s15, s21
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    vmov.f64 d2, d1
-; CHECK-NEXT:    vmov r12, s19
-; CHECK-NEXT:    vmov r2, s15
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s6, s10
 ; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s9
 ; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s13
-; CHECK-NEXT:    adcs r0, r3
-; CHECK-NEXT:    adds.w lr, lr, r2
-; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r4, r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.f64 d6, d5
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmov.f32 s10, s16
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r0, r8, d7
+; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    adds.w r2, r2, lr
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov r4, r12, d2
 ; CHECK-NEXT:    adds r0, r0, r5
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vmov r5, r7, d0
+; CHECK-NEXT:    adc.w r6, r6, r8
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w lr, r6, r3
+; CHECK-NEXT:    vmov r3, r6, d6
+; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    vmov r4, r2, d4
+; CHECK-NEXT:    adc.w r7, r7, r12
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r2, r6
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adcs r2, r7
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, lr
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <8 x i64>, <8 x i64>* %src, align 8
   %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
@@ -717,112 +709,90 @@ define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d3
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT:    vmov.f64 d14, d9
-; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s29, s19
-; CHECK-NEXT:    vmov.f32 s30, s2
-; CHECK-NEXT:    vmov.f64 d4, d13
-; CHECK-NEXT:    vmov.f32 s31, s3
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s9, s27
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s26, s0
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov.f32 s27, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    vmov.f64 d10, d7
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r2, s27
-; CHECK-NEXT:    vmov.f32 s21, s15
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s14, s4
-; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s23, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
+; CHECK-NEXT:    vmov.f64 d4, d11
+; CHECK-NEXT:    vmov.f32 s9, s23
+; CHECK-NEXT:    vmov r3, r2, d7
+; CHECK-NEXT:    vmov r4, r5, d3
+; CHECK-NEXT:    vmov.f32 s10, s18
+; CHECK-NEXT:    vmov.f32 s11, s19
+; CHECK-NEXT:    vmov.f32 s22, s16
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT:    vmov q7, q5
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT:    vmov r0, r6, d15
+; CHECK-NEXT:    vmov.f64 d14, d11
+; CHECK-NEXT:    vmov.f32 s29, s23
+; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov.f32 s30, s26
+; CHECK-NEXT:    vmov.f32 s22, s24
+; CHECK-NEXT:    vmov.f32 s31, s27
+; CHECK-NEXT:    vmov.f32 s23, s25
+; CHECK-NEXT:    vmov.f64 d12, d9
+; CHECK-NEXT:    adds r7, r4, r3
+; CHECK-NEXT:    adcs r5, r2
+; CHECK-NEXT:    vmov r4, r8, d14
+; CHECK-NEXT:    vmov r2, r3, d10
+; CHECK-NEXT:    vmov.f32 s25, s19
+; CHECK-NEXT:    vmov.f32 s26, s2
 ; CHECK-NEXT:    vmov.f32 s18, s0
+; CHECK-NEXT:    vmov.f32 s27, s3
 ; CHECK-NEXT:    vmov.f32 s19, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s28
-; CHECK-NEXT:    adcs r0, r2
-; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s29
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    adds.w r0, r0, lr
+; CHECK-NEXT:    adc.w r6, r6, r12
+; CHECK-NEXT:    adds.w lr, r0, r7
+; CHECK-NEXT:    adc.w r12, r6, r5
+; CHECK-NEXT:    vmov r6, r5, d12
 ; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov r7, s6
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s18
-; CHECK-NEXT:    adcs r4, r0
-; CHECK-NEXT:    adds.w r9, r5, r2
-; CHECK-NEXT:    vmov r5, s30
-; CHECK-NEXT:    adc.w r8, r4, r3
-; CHECK-NEXT:    vmov r2, s31
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    adds r3, r3, r7
-; CHECK-NEXT:    vmov r7, s12
-; CHECK-NEXT:    adcs r4, r6
+; CHECK-NEXT:    vmov r4, r0, d8
+; CHECK-NEXT:    adc.w r3, r3, r8
+; CHECK-NEXT:    adds r6, r6, r4
+; CHECK-NEXT:    adcs r0, r5
+; CHECK-NEXT:    adds.w r9, r6, r2
+; CHECK-NEXT:    adc.w r8, r0, r3
+; CHECK-NEXT:    vmov r5, r4, d15
+; CHECK-NEXT:    vmov r3, r6, d11
+; CHECK-NEXT:    vmov r7, r0, d9
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adcs r6, r4
+; CHECK-NEXT:    vmov r5, r4, d13
+; CHECK-NEXT:    adds r5, r5, r7
+; CHECK-NEXT:    adcs r0, r4
 ; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    vmov r6, s20
-; CHECK-NEXT:    adc.w r10, r4, r2
-; CHECK-NEXT:    vmov r4, s21
+; CHECK-NEXT:    adc.w r10, r0, r6
+; CHECK-NEXT:    vmov r4, r5, d4
+; CHECK-NEXT:    vmov r6, r7, d0
+; CHECK-NEXT:    vmov r2, r0, d2
 ; CHECK-NEXT:    vmov q1[2], q1[0], r9, r3
-; CHECK-NEXT:    vmov r5, s13
 ; CHECK-NEXT:    vmov q1[3], q1[1], r8, r10
-; CHECK-NEXT:    vmov r2, s24
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    adds r6, r6, r7
-; CHECK-NEXT:    vmov r7, s25
-; CHECK-NEXT:    adcs r4, r5
-; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r7, r5
-; CHECK-NEXT:    adds r0, r0, r6
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    adc.w r0, r4, r2
+; CHECK-NEXT:    adds r4, r4, r6
+; CHECK-NEXT:    adcs r5, r7
+; CHECK-NEXT:    vmov r6, r7, d6
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    adcs r0, r7
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    adcs r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
@@ -1101,31 +1071,30 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vins.f16 s1, s3
-; CHECK-NEXT:    vmovx.f16 s16, s2
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmovx.f16 s9, s5
-; CHECK-NEXT:    vins.f16 s9, s12
-; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vldrh.u16 q2, [r0, #16]
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s16, s3
+; CHECK-NEXT:    vmovx.f16 s12, s10
+; CHECK-NEXT:    vmovx.f16 s5, s8
+; CHECK-NEXT:    vins.f16 s5, s12
+; CHECK-NEXT:    vmovx.f16 s12, s1
 ; CHECK-NEXT:    vins.f16 s12, s16
-; CHECK-NEXT:    vins.f16 s5, s7
-; CHECK-NEXT:    vmovx.f16 s16, s6
-; CHECK-NEXT:    vmovx.f16 s13, s4
+; CHECK-NEXT:    vins.f16 s8, s10
+; CHECK-NEXT:    vmovx.f16 s16, s11
+; CHECK-NEXT:    vmovx.f16 s13, s9
+; CHECK-NEXT:    vins.f16 s1, s3
 ; CHECK-NEXT:    vins.f16 s13, s16
-; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s9, s11
 ; CHECK-NEXT:    vmov.f32 s16, s1
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmov.f32 s17, s5
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f32 s17, s9
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vadd.f16 q3, q4, q3
 ; CHECK-NEXT:    vadd.f16 q0, q0, q3
-; CHECK-NEXT:    vadd.f16 q2, q4, q2
-; CHECK-NEXT:    vadd.f16 q0, q0, q2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r0, r2, d0
 ; CHECK-NEXT:    strd r0, r2, [r1]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll
index a581734794b7d..9d6978a0885a4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll
@@ -495,19 +495,16 @@ define arm_aapcs_vfpcc i64 @uminv2i64(<2 x i64> %vec, i64 %min) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    cmp r3, r2
+; CHECK-NEXT:    csel r4, r3, r2, lo
 ; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r4, r2, r3, lo
-; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    csel r2, r2, r3, lo
-; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r5, r2, r4, eq
+; CHECK-NEXT:    csel r2, r3, r2, lo
 ; CHECK-NEXT:    csel r3, lr, r12, lo
+; CHECK-NEXT:    csel r5, r4, r2, eq
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    subs r2, r5, r0
-; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    sbcs.w r2, r3, r1
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r4, #1
@@ -526,19 +523,16 @@ define arm_aapcs_vfpcc i64 @sminv2i64(<2 x i64> %vec, i64 %min) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r4, r2, r3, lt
-; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    csel r2, r2, r3, lo
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    cmp r3, r2
+; CHECK-NEXT:    csel r4, r3, r2, lo
 ; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r5, r2, r4, eq
+; CHECK-NEXT:    csel r2, r3, r2, lt
 ; CHECK-NEXT:    csel r3, lr, r12, lt
+; CHECK-NEXT:    csel r5, r4, r2, eq
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    subs r2, r5, r0
-; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    sbcs.w r2, r3, r1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1
@@ -557,19 +551,16 @@ define arm_aapcs_vfpcc i64 @umaxv2i64(<2 x i64> %vec, i64 %max) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    cmp r3, r2
+; CHECK-NEXT:    csel r4, r3, r2, hi
 ; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r4, r2, r3, hi
-; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    csel r2, r2, r3, hi
-; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r5, r2, r4, eq
+; CHECK-NEXT:    csel r2, r3, r2, hi
 ; CHECK-NEXT:    csel r3, lr, r12, hi
+; CHECK-NEXT:    csel r5, r4, r2, eq
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    subs r2, r0, r5
-; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    sbcs.w r2, r1, r3
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r4, #1
@@ -588,19 +579,16 @@ define arm_aapcs_vfpcc i64 @smaxv2i64(<2 x i64> %vec, i64 %max) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r4, r2, r3, gt
-; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    csel r2, r2, r3, hi
+; CHECK-NEXT:    vmov r2, r12, d1
+; CHECK-NEXT:    vmov r3, lr, d0
+; CHECK-NEXT:    cmp r3, r2
+; CHECK-NEXT:    csel r4, r3, r2, hi
 ; CHECK-NEXT:    cmp lr, r12
-; CHECK-NEXT:    csel r5, r2, r4, eq
+; CHECK-NEXT:    csel r2, r3, r2, gt
 ; CHECK-NEXT:    csel r3, lr, r12, gt
+; CHECK-NEXT:    csel r5, r4, r2, eq
+; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    subs r2, r0, r5
-; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    sbcs.w r2, r1, r3
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index a2b7fad320c58..9b0bc7e72516c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -876,12 +876,11 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn32_badlanes(<4 x i32> %src1) {
 ; CHECK-LABEL: vmovn32_badlanes:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.16 q1[5], r0
 ; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q1[3], r1
+; CHECK-NEXT:    vmov.16 q1[5], r1
 ; CHECK-NEXT:    vmov.16 q1[7], r0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
@@ -889,12 +888,11 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_badlanes(<4 x i32> %src1) {
 ; CHECKBE-LABEL: vmovn32_badlanes:
 ; CHECKBE:       @ %bb.0: @ %entry
 ; CHECKBE-NEXT:    vrev64.32 q1, q0
-; CHECKBE-NEXT:    vmov r0, s4
+; CHECKBE-NEXT:    vmov r0, r1, d2
 ; CHECKBE-NEXT:    vmov.16 q2[1], r0
-; CHECKBE-NEXT:    vmov r0, s5
-; CHECKBE-NEXT:    vmov.16 q2[3], r0
-; CHECKBE-NEXT:    vmov.16 q2[5], r0
 ; CHECKBE-NEXT:    vmov r0, s6
+; CHECKBE-NEXT:    vmov.16 q2[3], r1
+; CHECKBE-NEXT:    vmov.16 q2[5], r1
 ; CHECKBE-NEXT:    vmov.16 q2[7], r0
 ; CHECKBE-NEXT:    vrev64.16 q0, q2
 ; CHECKBE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
index 411e90152e0e8..c59fbeaeb46d3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
@@ -15,18 +15,14 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noa
 ; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmullb.s32 q2, q1, q0
 ; CHECK-NEXT:    vmullt.s32 q3, q1, q0
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    vmov r12, s10
+; CHECK-NEXT:    vmov r12, r5, d5
 ; CHECK-NEXT:    lsrl r12, r5, #31
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r5, s9
+; CHECK-NEXT:    vmov r4, r5, d4
 ; CHECK-NEXT:    lsrl r4, r5, #31
 ; CHECK-NEXT:    vmov q2[2], q2[0], r4, r12
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    vmov r12, s14
+; CHECK-NEXT:    vmov r12, r5, d7
 ; CHECK-NEXT:    lsrl r12, r5, #31
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r5, s13
+; CHECK-NEXT:    vmov r4, r5, d6
 ; CHECK-NEXT:    lsrl r4, r5, #31
 ; CHECK-NEXT:    vmov q2[3], q2[1], r4, r12
 ; CHECK-NEXT:    vstrb.8 q2, [r2], #16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
index 1eee60b1003b5..21829964c2abf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
@@ -85,22 +85,20 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov.u16 r1, q1[1]
 ; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q2[4]
 ; CHECK-NEXT:    vmullb.s16 q0, q3, q0
 ; CHECK-NEXT:    vmov.i32 q3, #0x7fff
 ; CHECK-NEXT:    vshl.i32 q0, q0, #10
 ; CHECK-NEXT:    vshr.s32 q0, q0, #10
 ; CHECK-NEXT:    vshr.s32 q0, q0, #15
 ; CHECK-NEXT:    vmin.s32 q4, q0, q3
-; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.16 q0[1], r1
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov.16 q0[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.u16 r1, q2[4]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.u16 r1, q2[5]
@@ -116,14 +114,12 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vshr.s32 q1, q1, #10
 ; CHECK-NEXT:    vshr.s32 q1, q1, #15
 ; CHECK-NEXT:    vmin.s32 q1, q1, q3
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.16 q0[7], r1
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
index c7854e7061f3f..48209bdcc0179 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
@@ -164,22 +164,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_smaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    mvn r12, #-2147483648
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    subs.w r2, r2, r12
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    subs.w r1, r1, r12
+; CHECK-NEXT:    sbcs r1, r2, #0
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs.w r3, r3, r12
+; CHECK-NEXT:    subs.w r2, r2, r12
 ; CHECK-NEXT:    mov.w r12, #-1
-; CHECK-NEXT:    sbcs r2, r2, #0
+; CHECK-NEXT:    sbcs r2, r3, #0
 ; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r2, #1
@@ -192,19 +190,17 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) {
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    sbcs.w r1, r12, r1
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
+; CHECK-NEXT:    sbcs.w r1, r12, r2
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
-; CHECK-NEXT:    sbcs.w r2, r12, r2
+; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
+; CHECK-NEXT:    sbcs.w r2, r12, r3
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
@@ -240,21 +236,19 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_sminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    mov.w r12, #-1
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    sbcs.w r1, r12, r1
+; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
+; CHECK-NEXT:    sbcs.w r1, r12, r2
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
-; CHECK-NEXT:    sbcs.w r2, r12, r2
+; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
+; CHECK-NEXT:    sbcs.w r2, r12, r3
 ; CHECK-NEXT:    mvn r12, #-2147483648
 ; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lt
@@ -268,19 +262,17 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) {
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    subs.w r2, r2, r12
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    subs.w r1, r1, r12
+; CHECK-NEXT:    sbcs r1, r2, #0
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs.w r3, r3, r12
-; CHECK-NEXT:    sbcs r2, r2, #0
+; CHECK-NEXT:    subs.w r2, r2, r12
+; CHECK-NEXT:    sbcs r2, r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
@@ -316,21 +308,19 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_umaxmin(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_umaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    subs.w r1, r1, #-1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    subs.w r0, r0, #-1
+; CHECK-NEXT:    sbcs r0, r1, #0
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    subs.w r3, r3, #-1
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    subs.w r1, r1, #-1
+; CHECK-NEXT:    sbcs r1, r3, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
@@ -350,21 +340,19 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_uminmax(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_uminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    subs.w r1, r1, #-1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    subs.w r0, r0, #-1
+; CHECK-NEXT:    sbcs r0, r1, #0
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    subs.w r3, r3, #-1
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    subs.w r1, r1, #-1
+; CHECK-NEXT:    sbcs r1, r3, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
index 6c64285c5fd83..e1205e7d3afed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
@@ -180,56 +180,52 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_smaxmin(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_smaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    mvn r12, #-2147483648
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    mov.w lr, #0
-; CHECK-NEXT:    asrl r2, r1, #3
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    subs.w r3, r2, r12
-; CHECK-NEXT:    sbcs r3, r1, #0
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    mvn lr, #-2147483648
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    asrl r2, r3, #3
+; CHECK-NEXT:    asrl r0, r1, #3
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    subs.w r2, r2, lr
+; CHECK-NEXT:    sbcs r2, r3, #0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    asrl r4, r3, #3
-; CHECK-NEXT:    subs.w r0, r4, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT:    sbcs r0, r3, #0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    subs.w r0, r0, lr
+; CHECK-NEXT:    sbcs r0, r1, #0
 ; CHECK-NEXT:    mov.w r0, #0
-; CHECK-NEXT:    mov.w r2, #-1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r5
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
 ; CHECK-NEXT:    adr r0, .LCPI12_0
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vbic q1, q1, q0
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    mov.w r2, #-1
+; CHECK-NEXT:    vbic q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    sbcs.w r0, r2, r0
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    rsbs.w r0, r0, #-2147483648
+; CHECK-NEXT:    sbcs.w r0, r2, r1
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
-; CHECK-NEXT:    sbcs.w r1, r2, r1
+; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
+; CHECK-NEXT:    sbcs.w r1, r2, r3
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w lr, #1
-; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    movlt.w r12, #1
+; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
@@ -238,7 +234,7 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_smaxmin(<2 x i64> %so) {
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI12_0:
@@ -265,19 +261,17 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r2, r1, d1
 ; CHECK-NEXT:    mov.w r12, #-1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    mov.w lr, #0
 ; CHECK-NEXT:    asrl r2, r1, #3
-; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    mov.w lr, #0
 ; CHECK-NEXT:    rsbs.w r3, r2, #-2147483648
 ; CHECK-NEXT:    sbcs.w r3, r12, r1
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r4, r3, d0
 ; CHECK-NEXT:    csetm r0, ne
 ; CHECK-NEXT:    asrl r4, r3, #3
 ; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
@@ -297,19 +291,17 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) {
 ; CHECK-NEXT:    vbic q1, q1, q0
 ; CHECK-NEXT:    vand q0, q2, q0
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    subs r1, r1, r2
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    subs r0, r0, r2
+; CHECK-NEXT:    sbcs r0, r1, #0
+; CHECK-NEXT:    vmov r1, r3, d0
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    subs r1, r1, r2
+; CHECK-NEXT:    sbcs r1, r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt.w lr, #1
 ; CHECK-NEXT:    cmp.w lr, #0
@@ -346,37 +338,33 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_umaxmin(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_umaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
-; CHECK-NEXT:    lsrl r0, r5, #3
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    subs.w r3, r0, #-1
-; CHECK-NEXT:    sbcs r3, r5, #0
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    vmov r2, r1, d0
+; CHECK-NEXT:    lsrl r0, r3, #3
+; CHECK-NEXT:    lsrl r2, r1, #3
+; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    subs.w r0, r0, #-1
+; CHECK-NEXT:    sbcs r0, r3, #0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    csetm r12, ne
-; CHECK-NEXT:    lsrl r4, r3, #3
-; CHECK-NEXT:    subs.w r1, r4, #-1
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r0
-; CHECK-NEXT:    sbcs r1, r3, #0
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    subs.w r2, r2, #-1
+; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r2, #1
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r5
+; CHECK-NEXT:    movlo.w r12, #1
+; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r12
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
-; CHECK-NEXT:    vbic q1, q1, q0
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vbic q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    bx lr
 entry:
   %s0 = lshr <2 x i64> %so, <i64 3, i64 3>
   %c1 = icmp ult <2 x i64> %s0, <i64 4294967295, i64 4294967295>
@@ -387,37 +375,33 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_uminmax(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_uminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
-; CHECK-NEXT:    lsrl r0, r5, #3
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    subs.w r3, r0, #-1
-; CHECK-NEXT:    sbcs r3, r5, #0
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    vmov r0, r3, d1
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    vmov r2, r1, d0
+; CHECK-NEXT:    lsrl r0, r3, #3
+; CHECK-NEXT:    lsrl r2, r1, #3
+; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    subs.w r0, r0, #-1
+; CHECK-NEXT:    sbcs r0, r3, #0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    csetm r12, ne
-; CHECK-NEXT:    lsrl r4, r3, #3
-; CHECK-NEXT:    subs.w r1, r4, #-1
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r0
-; CHECK-NEXT:    sbcs r1, r3, #0
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    subs.w r2, r2, #-1
+; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r2, #1
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r5
+; CHECK-NEXT:    movlo.w r12, #1
+; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r12
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
-; CHECK-NEXT:    vbic q1, q1, q0
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vbic q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    bx lr
 entry:
   %s0 = lshr <2 x i64> %so, <i64 3, i64 3>
   %c2 = icmp ult <2 x i64> %s0, <i64 4294967295, i64 4294967295>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index bd9b8a0bfa151..8529787fa2560 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -520,9 +520,8 @@ define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vins.f16 s4, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vins.f16 s5, s0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    str r0, [r1, #4]
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, r2, d2
+; CHECK-NEXT:    str r2, [r1, #4]
 ; CHECK-NEXT:    str r0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index f569ddb2de912..c1827f0c91886 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -301,35 +301,31 @@ entry:
 define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) {
 ; CHECK-LABEL: vst3_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrh.u32 q1, [r0]
-; CHECK-NEXT:    vldrh.u32 q3, [r0, #8]
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.f64 d0, d5
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.f32 s1, s7
-; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.f32 s3, s11
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.f32 s2, s15
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
-; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vldrh.u32 q1, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT:    vmov.f64 d6, d5
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov r0, r5, d2
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    vmov lr, r4, d1
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vmov.16 q0[1], r2
+; CHECK-NEXT:    vmov.32 q3[2], r4
+; CHECK-NEXT:    vmov r0, r4, d4
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov r12, s6
+; CHECK-NEXT:    vmov.16 q0[3], r5
+; CHECK-NEXT:    vstrh.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.16 q0[4], r3
+; CHECK-NEXT:    vmov.16 q0[5], r4
+; CHECK-NEXT:    vmov.16 q0[6], r12
+; CHECK-NEXT:    vmov.16 q0[7], lr
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
   %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
@@ -621,40 +617,33 @@ entry:
 define void @vst3_v4i8(<4 x i8> *%src, <12 x i8> *%dst) {
 ; CHECK-LABEL: vst3_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
 ; CHECK-NEXT:    vldrb.u32 q1, [r0]
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
-; CHECK-NEXT:    vldrb.u32 q3, [r0, #8]
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.8 q4[8], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.8 q4[9], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.8 q4[10], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.8 q4[11], r0
-; CHECK-NEXT:    vstrb.16 q0, [r1]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    str r0, [r1, #8]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov r2, lr, d0
+; CHECK-NEXT:    vmov r12, r3, d1
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
+; CHECK-NEXT:    vmov r0, r6, d3
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    vmov.8 q2[8], r4
+; CHECK-NEXT:    vmov.8 q2[9], r6
+; CHECK-NEXT:    vmov.8 q2[10], r3
+; CHECK-NEXT:    vmov.8 q2[11], r5
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    str r3, [r1, #8]
+; CHECK-NEXT:    vmov r3, r4, d2
+; CHECK-NEXT:    vmov.16 q1[0], r3
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.16 q1[2], r3
+; CHECK-NEXT:    vmov.16 q1[3], r4
+; CHECK-NEXT:    vmov.16 q1[4], lr
+; CHECK-NEXT:    vmov.16 q1[5], r5
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.16 q1[7], r12
+; CHECK-NEXT:    vstrb.16 q1, [r1]
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
   %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
@@ -1313,11 +1302,9 @@ define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
 ; CHECK-NEXT:    vins.f16 s4, s8
 ; CHECK-NEXT:    vins.f16 s2, s10
 ; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    str r0, [r1, #8]
-; CHECK-NEXT:    strd r3, r2, [r1]
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, r2, d0
+; CHECK-NEXT:    stm r1!, {r0, r2, r3}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -1365,11 +1352,10 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    vins.f16 s5, s10
 ; CHECK-NEXT:    vins.f16 s17, s12
 ; CHECK-NEXT:    vmov.f32 s16, s5
-; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vmov.f32 s3, s8
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r0, r2, d8
 ; CHECK-NEXT:    strd r0, r2, [r1, #16]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 5e9eaed161da1..eb6af2c017dc4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -203,40 +203,40 @@ entry:
 define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK-LABEL: vst4_v4i32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vdup.32 q4, r0
-; CHECK-NEXT:    vmov.f32 s1, s5
-; CHECK-NEXT:    vmov.f32 s2, s18
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vdup.32 q4, r0
-; CHECK-NEXT:    vmov.f32 s9, s4
-; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.f32 s16, s8
-; CHECK-NEXT:    vdup.32 q6, r0
-; CHECK-NEXT:    vmov.f32 s20, s11
-; CHECK-NEXT:    vmov.f32 s8, s10
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.f32 s21, s7
-; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r12, lr, d0
+; CHECK-NEXT:    vdup.32 q4, r3
+; CHECK-NEXT:    vmov.f64 d0, d6
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s4, s13
+; CHECK-NEXT:    vmov.f64 d4, d7
+; CHECK-NEXT:    vmov.f32 s12, s15
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vstrb.8 q3, [r1, #48]
 ; CHECK-NEXT:    vmov.f32 s9, s6
-; CHECK-NEXT:    vdup.32 q1, r0
-; CHECK-NEXT:    vmov.f32 s22, s26
-; CHECK-NEXT:    vstrb.8 q4, [r1]
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vmov.f32 s11, s7
-; CHECK-NEXT:    vstrb.8 q5, [r1, #48]
+; CHECK-NEXT:    vdup.32 q3, r2
+; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vmov.f32 s11, s15
 ; CHECK-NEXT:    vstrb.8 q2, [r1, #32]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vdup.32 q2, lr
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vstrb.8 q1, [r1, #16]
+; CHECK-NEXT:    vdup.32 q1, r12
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
   %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
@@ -297,43 +297,36 @@ entry:
 define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vst4_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrh.u32 q1, [r0]
-; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
-; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov lr, r12, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    vmov.16 q1[0], r4
+; CHECK-NEXT:    vmov r0, r4, d5
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov r0, r2, d4
+; CHECK-NEXT:    vmov.16 q1[4], r5
+; CHECK-NEXT:    vmov.16 q1[5], r3
+; CHECK-NEXT:    vmov r3, r5, d0
+; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.16 q1[6], r4
+; CHECK-NEXT:    vmov.16 q0[1], lr
+; CHECK-NEXT:    vmov.16 q1[7], r4
 ; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
 ; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vstrh.16 q0, [r1, #16]
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vstrh.16 q4, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    vmov.16 q0[4], r5
+; CHECK-NEXT:    vmov.16 q0[5], r12
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
   %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
@@ -531,39 +524,35 @@ entry:
 define void @vst4_v4i8(<4 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-LABEL: vst4_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrb.u32 q1, [r0]
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
-; CHECK-NEXT:    vldrb.u32 q3, [r0, #8]
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.8 q0[0], r2
-; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
+; CHECK-NEXT:    vldrb.u32 q2, [r0, #8]
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    vmov.8 q0[0], r4
+; CHECK-NEXT:    vmov r0, r4, d4
 ; CHECK-NEXT:    vmov.8 q0[1], r2
-; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmov.8 q0[2], r0
 ; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r0, r2, d5
+; CHECK-NEXT:    vmov.8 q0[4], r5
+; CHECK-NEXT:    vmov.8 q0[5], r3
+; CHECK-NEXT:    vmov r3, r5, d3
+; CHECK-NEXT:    vmov.8 q0[6], r4
+; CHECK-NEXT:    vmov.8 q0[7], r4
+; CHECK-NEXT:    vmov.8 q0[8], r3
+; CHECK-NEXT:    vmov.8 q0[9], lr
 ; CHECK-NEXT:    vmov.8 q0[10], r0
 ; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov.8 q0[12], r5
+; CHECK-NEXT:    vmov.8 q0[13], r12
+; CHECK-NEXT:    vmov.8 q0[14], r2
+; CHECK-NEXT:    vmov.8 q0[15], r2
 ; CHECK-NEXT:    vstrb.8 q0, [r1]
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
   %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
@@ -984,33 +973,40 @@ entry:
 define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) {
 ; CHECK-LABEL: vst4_v4f32_align1:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s4, s13
-; CHECK-NEXT:    vmov.f32 s13, s8
-; CHECK-NEXT:    vmov.f32 s20, s15
-; CHECK-NEXT:    vmov.f32 s5, s9
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s22, s3
-; CHECK-NEXT:    vmov.f32 s7, s1
-; CHECK-NEXT:    vmov.f64 d8, d6
-; CHECK-NEXT:    vstrb.8 q1, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s17, s8
-; CHECK-NEXT:    vmov.f32 s18, s0
-; CHECK-NEXT:    vmov.f32 s19, s0
-; CHECK-NEXT:    vmov.f32 s23, s3
-; CHECK-NEXT:    vstrb.8 q4, [r1]
-; CHECK-NEXT:    vmov.f32 s0, s14
-; CHECK-NEXT:    vstrb.8 q5, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s1, s10
-; CHECK-NEXT:    vmov.f32 s3, s2
-; CHECK-NEXT:    vstrb.8 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d2, d8
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov.f32 s5, s8
+; CHECK-NEXT:    vdup.32 q5, r3
+; CHECK-NEXT:    vmov.f32 s8, s17
+; CHECK-NEXT:    vmov.f64 d6, d9
+; CHECK-NEXT:    vmov.f32 s16, s19
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vstrb.8 q4, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s13, s10
+; CHECK-NEXT:    vdup.32 q4, r2
+; CHECK-NEXT:    vmov r12, lr, d0
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmov.f32 s15, s2
+; CHECK-NEXT:    vstrb.8 q3, [r1, #32]
+; CHECK-NEXT:    vdup.32 q3, lr
+; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vmov.f32 s11, s15
+; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
+; CHECK-NEXT:    vdup.32 q2, r12
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s7, s0
+; CHECK-NEXT:    vstrb.8 q1, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
   %l1 = load <4 x float>, <4 x float>* %s1, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
index 9696827d84696..d055469064e52 100644
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -50,10 +50,8 @@ entry:
 define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) {
 ; CHECK-LABEL: foo_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpt.s32 lt, q0, zr
@@ -62,36 +60,31 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
 ; CHECK-NEXT:    vmov.i64 q5, #0xffffffff
 ; CHECK-NEXT:    vmov.f32 s2, s17
 ; CHECK-NEXT:    vand q6, q0, q5
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmov r1, s25
-; CHECK-NEXT:    bl __aeabi_ul2d
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    vmov r1, s27
+; CHECK-NEXT:    vmov r0, r1, d13
 ; CHECK-NEXT:    bl __aeabi_ul2d
+; CHECK-NEXT:    vmov r2, r3, d12
 ; CHECK-NEXT:    vmov.f64 d0, d9
 ; CHECK-NEXT:    vmov.f32 s2, s19
-; CHECK-NEXT:    vand q0, q0, q5
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r7, s1
-; CHECK-NEXT:    vmov d8, r4, r5
+; CHECK-NEXT:    vand q5, q0, q5
+; CHECK-NEXT:    vmov r4, r5, d11
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_ul2d
+; CHECK-NEXT:    vmov d8, r0, r1
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    mov r1, r5
+; CHECK-NEXT:    bl __aeabi_ul2d
+; CHECK-NEXT:    vmov r2, r3, d10
 ; CHECK-NEXT:    vmov d11, r0, r1
-; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_ul2d
 ; CHECK-NEXT:    vmov d10, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vmov q1, q5
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)


        


More information about the llvm-commits mailing list