[llvm] e1c1adf - [ARM] Match dual lane vmovs from insert_vector_elt

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 18 08:13:23 PST 2020


Author: David Green
Date: 2020-12-18T16:13:08Z
New Revision: e1c1adf9dc1a6e535ff6a8d5373e968f772e68e1

URL: https://github.com/llvm/llvm-project/commit/e1c1adf9dc1a6e535ff6a8d5373e968f772e68e1
DIFF: https://github.com/llvm/llvm-project/commit/e1c1adf9dc1a6e535ff6a8d5373e968f772e68e1.diff

LOG: [ARM] Match dual lane vmovs from insert_vector_elt

MVE has a dual lane vector move instruction, capable of moving two
general purpose registers into lanes of a vector register. It looks
like one of:
  vmov q0[2], q0[0], r2, r0
  vmov q0[3], q0[1], r3, r1
It only accepts these lane indices though (and only inserts into an
i32), either moving lanes 1 and 3, or 0 and 2.

This patch adds some tablegen patterns for them, selecting from vector
insert elements. Because the insert_elements are known to be
canonicalized to ascending order there are several patterns that we need
to select. These lane indices are:

3 2 1 0    -> vmovqrr 31; vmovqrr 20
3 2 1      -> vmovqrr 31; vmov 2
3 1        -> vmovqrr 31
2 1 0      -> vmovqrr 20; vmov 1
2 0        -> vmovqrr 20

With the top one being the most common. All other potential patterns of
lane indices will be matched by a combination of these and the
individual vmov pattern already present. This does mean that we are
selecting several machine instructions at once due to the need to
re-arrange the inserts, but in this case there is nothing else that will
attempt to match an insert_vector_elt node.

This is a recommit of 6cc3d80a84884a79967fffa4596c14001b8ba8a3 after
fixing the backward instruction definitions.

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/test/CodeGen/Thumb2/active_lane_mask.ll
    llvm/test/CodeGen/Thumb2/mve-abs.ll
    llvm/test/CodeGen/Thumb2/mve-div-expand.ll
    llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
    llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/test/CodeGen/Thumb2/mve-minmax.ll
    llvm/test/CodeGen/Thumb2/mve-neg.ll
    llvm/test/CodeGen/Thumb2/mve-phireg.ll
    llvm/test/CodeGen/Thumb2/mve-pred-and.ll
    llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
    llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
    llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
    llvm/test/CodeGen/Thumb2/mve-pred-not.ll
    llvm/test/CodeGen/Thumb2/mve-pred-or.ll
    llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-sext.ll
    llvm/test/CodeGen/Thumb2/mve-shifts.ll
    llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
    llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
    llvm/test/CodeGen/Thumb2/mve-vabdus.ll
    llvm/test/CodeGen/Thumb2/mve-vcmp.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
    llvm/test/CodeGen/Thumb2/mve-vcreate.ll
    llvm/test/CodeGen/Thumb2/mve-vcvt.ll
    llvm/test/CodeGen/Thumb2/mve-vdup.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
    llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld3.ll
    llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vmulh.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
    llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
    llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
    llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll
    llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index d792240c9ffd..2d937930d89f 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4790,6 +4790,14 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
       }
     }
   }
+  if (MI.getOpcode() == ARM::MVE_VMOV_q_rr) {
+    assert(MI.getOperand(4).isImm() && MI.getOperand(5).isImm());
+    if ((MI.getOperand(4).getImm() != 2 && MI.getOperand(4).getImm() != 3) ||
+        MI.getOperand(4).getImm() != MI.getOperand(5).getImm() + 2) {
+      ErrInfo = "Incorrect array index for MVE_VMOV_q_rr";
+      return false;
+    }
+  }
   return true;
 }
 

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 42498be05eea..b4e4397b44c9 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5845,6 +5845,41 @@ def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd),
   let AsmMatchConverter = "cvtMVEVMOVQtoDReg";
 }
 
+let Predicates = [HasMVEInt] in {
+  // Double lane moves. There are a number of patterns here. We know that the
+  // insertelt's will be in descending order by index, and need to match the 5
+  // patterns that might contain 2-0 or 3-1 pairs. These are:
+  // 3 2 1 0    -> vmovqrr 31; vmovqrr 20
+  // 3 2 1      -> vmovqrr 31; vmov 2
+  // 3 1        -> vmovqrr 31
+  // 2 1 0      -> vmovqrr 20; vmov 1
+  // 2 0        -> vmovqrr 20
+  // The other potential patterns will be handled by single lane inserts.
+  def : Pat<(insertelt (insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
+                                                        rGPR:$srcA, (i32 0)),
+                                             rGPR:$srcB, (i32 1)),
+                                  rGPR:$srcC, (i32 2)),
+                       rGPR:$srcD, (i32 3)),
+            (MVE_VMOV_q_rr (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcC, (i32 2), (i32 0)),
+                           rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>;
+  def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
+                                             rGPR:$srcB, (i32 1)),
+                                  rGPR:$srcC, (i32 2)),
+                       rGPR:$srcD, (i32 3)),
+            (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 2)),
+                           rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>;
+  def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 1)), rGPR:$srcB, (i32 3)),
+            (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 3), (i32 1))>;
+  def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
+                                             rGPR:$srcB, (i32 0)),
+                                  rGPR:$srcC, (i32 1)),
+                       rGPR:$srcD, (i32 2)),
+            (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 1)),
+                           rGPR:$srcB, rGPR:$srcD, (i32 2), (i32 0))>;
+  def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 0)), rGPR:$srcB, (i32 2)),
+            (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 2), (i32 0))>;
+}
+
 // end of coproc mov
 
 // start of MVE interleaving load/store

diff  --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 729493163b81..b356e09d26a8 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -39,28 +39,24 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    adr r3, .LCPI1_0
 ; CHECK-NEXT:    vdup.32 q1, r1
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    ldr r3, [sp, #32]
 ; CHECK-NEXT:    vadd.i32 q2, q0, r1
 ; CHECK-NEXT:    vdup.32 q0, r2
 ; CHECK-NEXT:    vcmp.u32 hi, q1, q2
-; CHECK-NEXT:    ldr r2, [sp, #32]
+; CHECK-NEXT:    ldr r2, [sp, #40]
 ; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.u32 hi, q0, q2
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    ldr r2, [sp, #36]
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    ldr r2, [sp, #40]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    ldr r2, [sp, #44]
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    ldr r2, [sp]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    ldr r2, [sp, #4]
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    ldr r3, [sp, #36]
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    ldr r2, [sp, #8]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    ldr r3, [sp]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    ldr r2, [sp, #12]
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    ldr r3, [sp, #4]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    adr r2, .LCPI1_1
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
@@ -70,21 +66,19 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    vcmp.u32 hi, q1, q2
 ; CHECK-NEXT:    vmrs r1, p0
 ; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    ldr r2, [sp, #48]
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    ldr r1, [sp, #48]
+; CHECK-NEXT:    ldr r1, [sp, #52]
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.u32 hi, q0, q2
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    ldr r1, [sp, #52]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
 ; CHECK-NEXT:    ldr r1, [sp, #56]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    ldr r1, [sp, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
 ; CHECK-NEXT:    ldr r1, [sp, #20]
+; CHECK-NEXT:    ldr r2, [sp, #16]
 ; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    ldr r1, [sp, #24]
-; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov.f32 s2, s1
@@ -413,81 +407,75 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
 define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) {
 ; CHECK-LABEL: test_width2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    beq.w .LBB4_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    adds r0, r2, #1
-; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r2
 ; CHECK-NEXT:    bic r0, r0, #1
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    adr r2, .LCPI4_0
 ; CHECK-NEXT:    subs r0, #2
+; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    vldrw.u32 q2, [r2]
 ; CHECK-NEXT:    add.w lr, r3, r0, lsr #1
-; CHECK-NEXT:    adr r3, .LCPI4_0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    vldrw.u32 q1, [r3]
-; CHECK-NEXT:    vand q2, q2, q0
+; CHECK-NEXT:    vand q1, q1, q0
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov.32 q3[0], r6
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov.32 q3[2], r6
-; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov q3[2], q3[0], r8, r8
+; CHECK-NEXT:    vmov r7, s6
 ; CHECK-NEXT:    vand q3, q3, q0
-; CHECK-NEXT:    adds r6, #2
+; CHECK-NEXT:    vmov r6, s7
 ; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    add.w r8, r8, #2
+; CHECK-NEXT:    vmov r9, s12
 ; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    adds r3, #1
-; CHECK-NEXT:    adc r12, r2, #0
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r9, r3
 ; CHECK-NEXT:    vand q3, q3, q0
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    teq.w r4, r2
-; CHECK-NEXT:    cset r2, ne
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q4[0], r2
-; CHECK-NEXT:    vmov.32 q4[1], r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    eors r3, r2
-; CHECK-NEXT:    orrs.w r3, r3, r12
-; CHECK-NEXT:    cset r3, ne
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs r5, r4, r5
-; CHECK-NEXT:    vmov.32 q4[2], r3
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    vmov.32 q4[3], r3
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    veor q4, q4, q1
-; CHECK-NEXT:    sbcs.w r0, r3, r0
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    adc r12, r2, #0
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov r4, s15
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    subs r7, r5, r7
+; CHECK-NEXT:    vmov r7, s12
+; CHECK-NEXT:    sbcs r4, r6
+; CHECK-NEXT:    vmov r6, s13
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r0, #1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    @ implicit-def: $q3
-; CHECK-NEXT:    sbcs r0, r3
+; CHECK-NEXT:    movlo r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csetm r4, ne
+; CHECK-NEXT:    subs r2, r7, r2
+; CHECK-NEXT:    sbcs.w r0, r6, r0
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vand q4, q4, q5
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r4
+; CHECK-NEXT:    vmov q3[3], q3[1], r0, r4
+; CHECK-NEXT:    eor.w r0, r5, r3
+; CHECK-NEXT:    orrs.w r0, r0, r12
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    teq.w r7, r9
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r0
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r0
+; CHECK-NEXT:    veor q4, q4, q2
+; CHECK-NEXT:    vand q4, q4, q3
+; CHECK-NEXT:    @ implicit-def: $q3
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    and r2, r2, #1
@@ -519,8 +507,9 @@ define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroe
 ; CHECK-NEXT:    le lr, .LBB4_2
 ; CHECK-NEXT:  .LBB4_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI4_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll
index 8a9b8814ef2e..4cba3ff05ed8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-abs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll
@@ -40,23 +40,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) {
 ; CHECK-LABEL: abs_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    adds.w r1, r1, r0, asr #31
-; CHECK-NEXT:    adc.w r2, r0, r0, asr #31
-; CHECK-NEXT:    eor.w r2, r2, r0, asr #31
-; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
-; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    adds.w r1, r1, r0, asr #31
+; CHECK-NEXT:    adc.w r12, r0, r0, asr #31
 ; CHECK-NEXT:    eor.w r1, r1, r0, asr #31
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    adc.w r1, r0, r0, asr #31
-; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    adds.w r2, r2, r3, asr #31
+; CHECK-NEXT:    eor.w r0, r12, r0, asr #31
+; CHECK-NEXT:    eor.w r2, r2, r3, asr #31
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    adc.w r1, r3, r3, asr #31
+; CHECK-NEXT:    eor.w r1, r1, r3, asr #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = icmp slt <2 x i64> %s1, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index dc1ed2e187fb..5704ca95e2b6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -5,22 +5,20 @@
 define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-LABEL: udiv_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    udiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    udiv r1, r2, r1
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q2[1], r1
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    udiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    udiv r1, r2, r1
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    udiv r0, r1, r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    udiv r1, r2, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
@@ -31,22 +29,20 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-LABEL: sdiv_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    sdiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    sdiv r1, r2, r1
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q2[1], r1
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sdiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sdiv r1, r2, r1
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    sdiv r0, r1, r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    sdiv r1, r2, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
@@ -59,26 +55,24 @@ define arm_aapcs_vfpcc <4 x i32> @urem_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    udiv r2, r1, r0
 ; CHECK-NEXT:    mls r12, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    udiv r3, r2, r1
 ; CHECK-NEXT:    mls lr, r3, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    udiv r0, r3, r2
-; CHECK-NEXT:    mls r0, r0, r2, r3
 ; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.32 q0[0], r12
+; CHECK-NEXT:    udiv r0, r3, r2
+; CHECK-NEXT:    mls r0, r0, r2, r3
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    udiv r1, r3, r2
-; CHECK-NEXT:    vmov.32 q0[1], lr
-; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    mls r1, r1, r2, r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = urem <4 x i32> %in1, %in2
@@ -90,26 +84,24 @@ define arm_aapcs_vfpcc <4 x i32> @srem_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sdiv r2, r1, r0
 ; CHECK-NEXT:    mls r12, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    sdiv r3, r2, r1
 ; CHECK-NEXT:    mls lr, r3, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    sdiv r0, r3, r2
-; CHECK-NEXT:    mls r0, r0, r2, r3
 ; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.32 q0[0], r12
+; CHECK-NEXT:    sdiv r0, r3, r2
+; CHECK-NEXT:    mls r0, r0, r2, r3
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    sdiv r1, r3, r2
-; CHECK-NEXT:    vmov.32 q0[1], lr
-; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    mls r1, r1, r2, r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = srem <4 x i32> %in1, %in2
@@ -739,31 +731,28 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: udiv_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    bl __aeabi_uldivmod
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    vmov r0, s20
 ; CHECK-NEXT:    vmov r1, s21
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    vmov r12, s22
-; CHECK-NEXT:    vmov lr, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r1, lr
-; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = udiv <2 x i64> %in1, %in2
   ret <2 x i64> %out
@@ -772,31 +761,28 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: sdiv_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    bl __aeabi_ldivmod
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    vmov r0, s20
 ; CHECK-NEXT:    vmov r1, s21
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    vmov r12, s22
-; CHECK-NEXT:    vmov lr, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r1, lr
-; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = sdiv <2 x i64> %in1, %in2
   ret <2 x i64> %out
@@ -805,31 +791,28 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @urem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: urem_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    bl __aeabi_uldivmod
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    mov r5, r3
 ; CHECK-NEXT:    vmov r0, s20
 ; CHECK-NEXT:    vmov r1, s21
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    vmov r12, s18
-; CHECK-NEXT:    vmov lr, s19
-; CHECK-NEXT:    vmov.32 q4[0], r2
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.32 q4[1], r3
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    mov r3, lr
-; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    vmov.32 q4[2], r2
-; CHECK-NEXT:    vmov.32 q4[3], r3
-; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = urem <2 x i64> %in1, %in2
   ret <2 x i64> %out
@@ -838,31 +821,28 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @srem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: srem_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    bl __aeabi_ldivmod
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    mov r5, r3
 ; CHECK-NEXT:    vmov r0, s20
 ; CHECK-NEXT:    vmov r1, s21
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    vmov r12, s18
-; CHECK-NEXT:    vmov lr, s19
-; CHECK-NEXT:    vmov.32 q4[0], r2
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.32 q4[1], r3
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    mov r3, lr
-; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    vmov.32 q4[2], r2
-; CHECK-NEXT:    vmov.32 q4[3], r3
-; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = srem <2 x i64> %in1, %in2
   ret <2 x i64> %out

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 0f3a91ca31af..c4f68959ecf4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -588,51 +588,47 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:  .LBB11_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vmov.u16 r7, q2[4]
-; CHECK-NEXT:    vmov.u16 r5, q2[0]
-; CHECK-NEXT:    vmov.32 q4[0], r7
-; CHECK-NEXT:    vmov.u16 r7, q2[5]
-; CHECK-NEXT:    vmov.32 q4[1], r7
 ; CHECK-NEXT:    vmov.u16 r7, q2[6]
-; CHECK-NEXT:    vmov.32 q4[2], r7
-; CHECK-NEXT:    vmov.u16 r7, q2[7]
-; CHECK-NEXT:    vmov.32 q4[3], r7
-; CHECK-NEXT:    vmov.32 q3[0], r5
+; CHECK-NEXT:    vmov.u16 r3, q2[4]
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r7
+; CHECK-NEXT:    vmov.u16 r3, q2[7]
+; CHECK-NEXT:    vmov.u16 r7, q2[5]
+; CHECK-NEXT:    vmov.u16 r5, q2[2]
+; CHECK-NEXT:    vmov q4[3], q4[1], r7, r3
+; CHECK-NEXT:    vmov.u16 r6, q2[0]
 ; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vmov.u16 r5, q2[1]
+; CHECK-NEXT:    vmov q3[2], q3[0], r6, r5
 ; CHECK-NEXT:    vshl.i32 q4, q4, #1
-; CHECK-NEXT:    vmov.32 q3[1], r5
-; CHECK-NEXT:    vadd.i32 q4, q4, r0
-; CHECK-NEXT:    vmov.u16 r5, q2[2]
-; CHECK-NEXT:    vmov r7, s16
-; CHECK-NEXT:    vmov.32 q3[2], r5
 ; CHECK-NEXT:    vmov.u16 r5, q2[3]
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov.32 q3[3], r5
-; CHECK-NEXT:    vadd.i16 q2, q2, q1
+; CHECK-NEXT:    vadd.i32 q4, q4, r0
+; CHECK-NEXT:    vmov.u16 r6, q2[1]
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov q3[3], q3[1], r6, r5
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
+; CHECK-NEXT:    vmov r7, s17
 ; CHECK-NEXT:    vshl.i32 q3, q3, #1
+; CHECK-NEXT:    vadd.i16 q2, q2, q1
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
 ; CHECK-NEXT:    vmov r5, s15
 ; CHECK-NEXT:    vmov r6, s14
 ; CHECK-NEXT:    vmov r12, s13
-; CHECK-NEXT:    ldrh.w r11, [r7]
-; CHECK-NEXT:    vmov r7, s12
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh.w r11, [r3]
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    ldrh r7, [r7]
 ; CHECK-NEXT:    ldrh.w r9, [r5]
 ; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    ldrh.w r10, [r6]
 ; CHECK-NEXT:    vmov r6, s19
 ; CHECK-NEXT:    ldrh.w r1, [r12]
-; CHECK-NEXT:    ldrh r7, [r7]
-; CHECK-NEXT:    vmov.16 q3[0], r7
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov.16 q3[0], r3
 ; CHECK-NEXT:    vmov.16 q3[1], r1
 ; CHECK-NEXT:    vmov.16 q3[2], r10
-; CHECK-NEXT:    vmov.16 q3[3], r9
-; CHECK-NEXT:    vmov.16 q3[4], r11
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.16 q3[5], r3
+; CHECK-NEXT:    vmov.16 q3[3], r9
 ; CHECK-NEXT:    ldrh r6, [r6]
+; CHECK-NEXT:    vmov.16 q3[4], r11
+; CHECK-NEXT:    vmov.16 q3[5], r7
 ; CHECK-NEXT:    vmov.16 q3[6], r5
 ; CHECK-NEXT:    vmov.16 q3[7], r6
 ; CHECK-NEXT:    vstrb.8 q3, [r4], #16
@@ -704,26 +700,27 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
 ; CHECK-NEXT:    blt.w .LBB12_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    bic r8, r2, #7
+; CHECK-NEXT:    bic r1, r2, #7
 ; CHECK-NEXT:    adr r6, .LCPI12_2
-; CHECK-NEXT:    sub.w r3, r8, #8
+; CHECK-NEXT:    sub.w r3, r1, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
+; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
 ; CHECK-NEXT:    movs r7, #1
-; CHECK-NEXT:    vmov.i16 q3, #0x18
 ; CHECK-NEXT:    add.w r1, r7, r3, lsr #3
 ; CHECK-NEXT:    adr r3, .LCPI12_0
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    adr r7, .LCPI12_1
-; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    vmov.i16 q3, #0x18
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
+; CHECK-NEXT:    str r1, [sp, #52] @ 4-byte Spill
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB12_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
-; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #52] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    ldr r4, [sp, #60] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
@@ -732,42 +729,28 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:  .LBB12_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vmov.u16 r3, q5[0]
-; CHECK-NEXT:    vmov.u16 r7, q7[4]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.u16 r3, q5[1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    vmov.u16 r3, q5[2]
-; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.u16 r5, q5[0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
 ; CHECK-NEXT:    vmov.u16 r3, q5[3]
-; CHECK-NEXT:    vmov.32 q0[3], r3
-; CHECK-NEXT:    vmov.u16 r12, q6[0]
+; CHECK-NEXT:    vmov.u16 r5, q5[1]
+; CHECK-NEXT:    vmov.u16 r7, q7[6]
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
+; CHECK-NEXT:    vmov.u16 r5, q5[4]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov.32 q1[0], r12
+; CHECK-NEXT:    vmov.u16 r12, q7[4]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vmov.u16 r1, q6[1]
+; CHECK-NEXT:    vmov.u16 r1, q7[5]
 ; CHECK-NEXT:    vadd.i32 q2, q0, r0
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov.u16 r1, q6[2]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q6[3]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q6[4]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    vmov r6, s11
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vadd.i32 q4, q1, r0
 ; CHECK-NEXT:    ldrh.w r9, [r3]
-; CHECK-NEXT:    vmov.u16 r3, q5[4]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.u16 r3, q5[5]
-; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    vmov.u16 r3, q5[6]
-; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
 ; CHECK-NEXT:    vmov.u16 r3, q5[7]
-; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    vmov.u16 r5, q5[5]
 ; CHECK-NEXT:    ldrh r6, [r6]
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
@@ -778,42 +761,44 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    ldrh r5, [r5]
 ; CHECK-NEXT:    ldrh.w r11, [r3]
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.32 q0[0], r7
-; CHECK-NEXT:    vmov.u16 r7, q7[5]
-; CHECK-NEXT:    vmov.32 q0[1], r7
-; CHECK-NEXT:    vmov.u16 r7, q7[6]
-; CHECK-NEXT:    vmov.32 q0[2], r7
+; CHECK-NEXT:    vmov q0[2], q0[0], r12, r7
 ; CHECK-NEXT:    vmov.u16 r7, q7[7]
-; CHECK-NEXT:    vmov.32 q0[3], r7
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r7
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r7, s2
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q6[5]
-; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    ldrh.w r8, [r3]
+; CHECK-NEXT:    vmov.u16 r3, q6[0]
+; CHECK-NEXT:    ldrh r7, [r1]
+; CHECK-NEXT:    vmov.u16 r1, q6[2]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    vmov.u16 r1, q6[3]
+; CHECK-NEXT:    vmov.u16 r3, q6[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
 ; CHECK-NEXT:    vmov.u16 r1, q6[6]
-; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.u16 r3, q6[4]
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
 ; CHECK-NEXT:    vmov.u16 r1, q6[7]
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[0]
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[1]
-; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov.u16 r3, q6[5]
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    vmov.u16 r1, q7[2]
-; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov.u16 r3, q7[0]
+; CHECK-NEXT:    vadd.i32 q4, q1, r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
 ; CHECK-NEXT:    vmov.u16 r1, q7[3]
-; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.u16 r3, q7[1]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q3, q3
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vshl.i32 q3, q3, #1
+; CHECK-NEXT:    vmovlb.s16 q3, q3
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vshl.i32 q3, q3, #1
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    ldrh r7, [r7]
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q1[0], r1
 ; CHECK-NEXT:    vmov r1, s9
@@ -824,7 +809,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vmov.16 q1[3], r6
 ; CHECK-NEXT:    vmov.16 q1[4], r10
 ; CHECK-NEXT:    vmov.16 q1[5], r11
-; CHECK-NEXT:    vmov.16 q1[6], r3
+; CHECK-NEXT:    vmov.16 q1[6], r8
 ; CHECK-NEXT:    vmov.16 q1[7], r5
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
@@ -882,7 +867,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    le lr, .LBB12_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
-; CHECK-NEXT:    cmp r8, r2
+; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT:    cmp r1, r2
 ; CHECK-NEXT:    bne.w .LBB12_2
 ; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #104

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
index 20e258d46b5e..053bfa12c117 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
@@ -461,18 +461,16 @@ define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q1, #0x10
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    ldr r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    ldr r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    ldr r2, [r2]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    ldr r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
index 236a695c0a5d..c4a469856cb1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
@@ -63,15 +63,13 @@ define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr)
 ; CHECK-NEXT:    ldrb r2, [r1]
 ; CHECK-NEXT:    vmov.i32 q0, #0xff
 ; CHECK-NEXT:    ldrb r1, [r1, #1]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
 ; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ldrb r1, [r0, r1]
 ; CHECK-NEXT:    ldrb r0, [r0, r2]
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <2 x i8>, <2 x i8>* %offptr, align 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 2a86ddbede65..f68aa0f8baa0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -9,8 +9,7 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(<2 x i32*>* %offptr) {
 ; CHECK-NEXT:    ldrd r1, r0, [r0]
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    ldr r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4
@@ -38,29 +37,25 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r4, s5
 ; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r4, s7
 ; CHECK-NEXT:    ldr.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    ldr.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    ldr r3, [r3]
+; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldr r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    ldr r4, [r4]
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    ldr r5, [r5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    ldr r1, [r1]
 ; CHECK-NEXT:    ldr r2, [r2]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
@@ -71,61 +66,53 @@ entry:
 define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(<16 x i32*>* %offptr) {
 ; CHECK-LABEL: ptr_v16i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
 ; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r6, s7
-; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov r6, s4
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r7, s5
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    ldr.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    ldr r5, [r5]
+; CHECK-NEXT:    ldr r6, [r6]
 ; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    ldr r7, [r7]
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    ldr r6, [r6]
 ; CHECK-NEXT:    ldr r4, [r4]
 ; CHECK-NEXT:    ldr.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    ldr r3, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov.32 q0[0], r5
-; CHECK-NEXT:    vmov r5, s5
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r5
+; CHECK-NEXT:    vmov r6, s7
+; CHECK-NEXT:    vmov r5, s11
 ; CHECK-NEXT:    ldr r1, [r1]
+; CHECK-NEXT:    ldr r6, [r6]
 ; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[2], r5
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.32 q0[3], r6
-; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r7, r6
+; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    ldr r6, [r6]
+; CHECK-NEXT:    vmov q1[2], q1[0], r6, r0
 ; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.32 q3[0], lr
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov r6, s13
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, lr
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    ldr r6, [r6]
+; CHECK-NEXT:    vmov q1[3], q1[1], r6, r0
 ; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.32 q2[0], r5
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r12
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.32 q2[2], r12
-; CHECK-NEXT:    vmov.32 q2[3], r4
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r5
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
   %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
@@ -233,12 +220,10 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(<2 x i16*>* %offptr) {
 ; CHECK-NEXT:    ldrd r1, r0, [r0]
 ; CHECK-NEXT:    ldrsh.w r0, [r0]
 ; CHECK-NEXT:    ldrsh.w r1, [r1]
-; CHECK-NEXT:    asrs r2, r0, #31
-; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
@@ -254,8 +239,7 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(<2 x i16*>* %offptr) {
 ; CHECK-NEXT:    vmov.i64 q0, #0xffff
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -269,18 +253,16 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    ldrh r2, [r2]
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -294,18 +276,16 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    ldrh r2, [r2]
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -323,30 +303,26 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r4, s5
 ; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r4, s7
 ; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -364,30 +340,26 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r4, s5
 ; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r4, s7
 ; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    ldrh r4, [r4]
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -593,18 +565,16 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    bx lr
@@ -620,18 +590,16 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -649,31 +617,27 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r4, s5
 ; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r4, s7
 ; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
 ; CHECK-NEXT:    vmovlb.s8 q1, q1
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -692,31 +656,27 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r4, s3
 ; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    ldrb.w lr, [r0]
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -844,33 +804,29 @@ define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base,
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov r4, s5
 ; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r4, s7
 ; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r3, s7
 ; CHECK-NEXT:    ldrb r0, [r0]
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov.32 q1[2], r3
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q1[3], r4
-; CHECK-NEXT:    vmov.32 q0[3], lr
-; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r3
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmovlb.s8 q1, q1
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
index 17b28811fd00..6b89010baf3d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
@@ -7,18 +7,16 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32_gather(i8* %base, <4 x i32>*
 ; NOGATSCAT:       @ %bb.0: @ %entry
 ; NOGATSCAT-NEXT:    vldrw.u32 q0, [r1]
 ; NOGATSCAT-NEXT:    vadd.i32 q0, q0, r0
-; NOGATSCAT-NEXT:    vmov r0, s0
-; NOGATSCAT-NEXT:    vmov r3, s1
-; NOGATSCAT-NEXT:    vmov r1, s2
+; NOGATSCAT-NEXT:    vmov r0, s2
+; NOGATSCAT-NEXT:    vmov r1, s0
 ; NOGATSCAT-NEXT:    vmov r2, s3
+; NOGATSCAT-NEXT:    vmov r3, s1
 ; NOGATSCAT-NEXT:    ldr r0, [r0]
-; NOGATSCAT-NEXT:    ldr r3, [r3]
-; NOGATSCAT-NEXT:    vmov.32 q0[0], r0
 ; NOGATSCAT-NEXT:    ldr r1, [r1]
-; NOGATSCAT-NEXT:    vmov.32 q0[1], r3
 ; NOGATSCAT-NEXT:    ldr r2, [r2]
-; NOGATSCAT-NEXT:    vmov.32 q0[2], r1
-; NOGATSCAT-NEXT:    vmov.32 q0[3], r2
+; NOGATSCAT-NEXT:    ldr r3, [r3]
+; NOGATSCAT-NEXT:    vmov q0[2], q0[0], r1, r0
+; NOGATSCAT-NEXT:    vmov q0[3], q0[1], r3, r2
 ; NOGATSCAT-NEXT:    bx lr
 ;
 ; NOMVE-LABEL: unscaled_i32_i32_gather:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index a3f06e5d2537..c7eed2631c32 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -93,64 +93,61 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q1
-; CHECK-LE-NEXT:    movs r4, #0
-; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
-; CHECK-LE-NEXT:    vmov.32 q0[0], lr
-; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
-; CHECK-LE-NEXT:    mov.w lr, #0
-; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt.w lr, #1
-; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
+; CHECK-LE-NEXT:    vmov q1[2], q1[0], lr, r5
+; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
+; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    cmp r1, #0
+; CHECK-LE-NEXT:    rsbs r4, r5, #0
+; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    it lt
+; CHECK-LE-NEXT:    movlt r3, #1
+; CHECK-LE-NEXT:    cmp r3, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r1, #1
-; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
-; CHECK-LE-NEXT:    vmov.32 q0[2], r12
-; CHECK-LE-NEXT:    and r3, r1, #3
-; CHECK-LE-NEXT:    lsls r1, r1, #31
+; CHECK-LE-NEXT:    mvnne r3, #1
+; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-LE-NEXT:    vmov r4, s4
+; CHECK-LE-NEXT:    and r12, r3, #3
+; CHECK-LE-NEXT:    lsls r1, r3, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
-; CHECK-LE-NEXT:    lsls r1, r3, #30
+; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
+; CHECK-LE-NEXT:    lsls.w r1, r12, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov r1, s6
-; CHECK-LE-NEXT:    vmov.32 q1[0], r3
-; CHECK-LE-NEXT:    rsbs r5, r2, #0
-; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
-; CHECK-LE-NEXT:    vmov r2, s2
+; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
+; CHECK-LE-NEXT:    vmov r3, s0
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmov r1, s2
+; CHECK-LE-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-LE-NEXT:    rsbs r5, r4, #0
 ; CHECK-LE-NEXT:    asr.w lr, r3, #31
-; CHECK-LE-NEXT:    vmov.32 q1[1], lr
+; CHECK-LE-NEXT:    vmov r3, s6
 ; CHECK-LE-NEXT:    asr.w r12, r1, #31
-; CHECK-LE-NEXT:    vmov.32 q1[2], r1
+; CHECK-LE-NEXT:    sbcs.w r1, r2, r4, asr #31
 ; CHECK-LE-NEXT:    mov.w r1, #0
+; CHECK-LE-NEXT:    vmov q0[3], q0[1], lr, r12
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    vmov.32 q1[3], r12
-; CHECK-LE-NEXT:    rsbs r3, r2, #0
-; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT:    rsbs r5, r3, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r4, #1
-; CHECK-LE-NEXT:    cmp r4, #0
+; CHECK-LE-NEXT:    movlt r2, #1
+; CHECK-LE-NEXT:    cmp r2, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r4, #1
-; CHECK-LE-NEXT:    bfi r4, r1, #0, #1
-; CHECK-LE-NEXT:    and r1, r4, #3
-; CHECK-LE-NEXT:    lsls r2, r4, #31
+; CHECK-LE-NEXT:    mvnne r2, #1
+; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
+; CHECK-LE-NEXT:    and r1, r2, #3
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne d2, [r0]
+; CHECK-LE-NEXT:    vstrne d0, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi d3, [r0, #8]
+; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
 ;
@@ -164,9 +161,7 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov.32 q0[1], r12
-; CHECK-BE-NEXT:    @ implicit-def: $q2
-; CHECK-BE-NEXT:    vmov.32 q0[3], lr
+; CHECK-BE-NEXT:    vmov q0[3], q0[1], r12, lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -178,6 +173,7 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
+; CHECK-BE-NEXT:    @ implicit-def: $q2
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB5_2
@@ -199,17 +195,15 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vmov r2, s11
 ; CHECK-BE-NEXT:    movs r4, #0
-; CHECK-BE-NEXT:    vmov r3, s1
 ; CHECK-BE-NEXT:    vmov r1, s3
+; CHECK-BE-NEXT:    vmov r3, s1
 ; CHECK-BE-NEXT:    rsbs r5, r2, #0
 ; CHECK-BE-NEXT:    sbcs.w r2, r4, r2, asr #31
 ; CHECK-BE-NEXT:    vmov r2, s9
-; CHECK-BE-NEXT:    asr.w lr, r3, #31
-; CHECK-BE-NEXT:    vmov.32 q1[0], lr
 ; CHECK-BE-NEXT:    asr.w r12, r1, #31
-; CHECK-BE-NEXT:    vmov.32 q1[1], r3
-; CHECK-BE-NEXT:    vmov.32 q1[2], r12
-; CHECK-BE-NEXT:    vmov.32 q1[3], r1
+; CHECK-BE-NEXT:    asr.w lr, r3, #31
+; CHECK-BE-NEXT:    vmov q1[2], q1[0], lr, r12
+; CHECK-BE-NEXT:    vmov q1[3], q1[1], r3, r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
@@ -247,65 +241,62 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q1
-; CHECK-LE-NEXT:    movs r4, #0
-; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
-; CHECK-LE-NEXT:    vmov.32 q0[0], lr
-; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
-; CHECK-LE-NEXT:    mov.w lr, #0
-; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt.w lr, #1
-; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
+; CHECK-LE-NEXT:    vmov q1[2], q1[0], lr, r5
+; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
+; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    cmp r1, #0
+; CHECK-LE-NEXT:    rsbs r4, r5, #0
+; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    it lt
+; CHECK-LE-NEXT:    movlt r3, #1
+; CHECK-LE-NEXT:    cmp r3, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r1, #1
-; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
-; CHECK-LE-NEXT:    vmov.32 q0[2], r12
-; CHECK-LE-NEXT:    and r3, r1, #3
-; CHECK-LE-NEXT:    lsls r1, r1, #31
+; CHECK-LE-NEXT:    mvnne r3, #1
+; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-LE-NEXT:    vmov r4, s4
+; CHECK-LE-NEXT:    and r12, r3, #3
+; CHECK-LE-NEXT:    lsls r1, r3, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
-; CHECK-LE-NEXT:    lsls r1, r3, #30
+; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
+; CHECK-LE-NEXT:    lsls.w r1, r12, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
-; CHECK-LE-NEXT:    vmov r2, s0
-; CHECK-LE-NEXT:    vmov r3, s4
-; CHECK-LE-NEXT:    vmov r1, s6
-; CHECK-LE-NEXT:    vmov.32 q1[0], r3
-; CHECK-LE-NEXT:    rsbs r5, r2, #0
-; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
-; CHECK-LE-NEXT:    vmov r2, s2
+; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
+; CHECK-LE-NEXT:    vmov r3, s0
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmov r1, s2
+; CHECK-LE-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-LE-NEXT:    rsbs r5, r4, #0
 ; CHECK-LE-NEXT:    asr.w lr, r3, #31
-; CHECK-LE-NEXT:    vmov.32 q1[1], lr
+; CHECK-LE-NEXT:    vmov r3, s6
 ; CHECK-LE-NEXT:    asr.w r12, r1, #31
-; CHECK-LE-NEXT:    vmov.32 q1[2], r1
+; CHECK-LE-NEXT:    sbcs.w r1, r2, r4, asr #31
 ; CHECK-LE-NEXT:    mov.w r1, #0
+; CHECK-LE-NEXT:    vmov q0[3], q0[1], lr, r12
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    vmov.32 q1[3], r12
-; CHECK-LE-NEXT:    rsbs r3, r2, #0
-; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT:    rsbs r5, r3, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r4, #1
-; CHECK-LE-NEXT:    cmp r4, #0
+; CHECK-LE-NEXT:    movlt r2, #1
+; CHECK-LE-NEXT:    cmp r2, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r4, #1
-; CHECK-LE-NEXT:    bfi r4, r1, #0, #1
-; CHECK-LE-NEXT:    and r1, r4, #3
-; CHECK-LE-NEXT:    lsls r2, r4, #31
+; CHECK-LE-NEXT:    mvnne r2, #1
+; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
+; CHECK-LE-NEXT:    and r1, r2, #3
+; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, r3, d2
+; CHECK-LE-NEXT:    vmovne r2, r3, d0
 ; CHECK-LE-NEXT:    strdne r2, r3, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, r2, d3
+; CHECK-LE-NEXT:    vmovmi r1, r2, d1
 ; CHECK-LE-NEXT:    strdmi r1, r2, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
@@ -320,9 +311,7 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov.32 q0[1], r12
-; CHECK-BE-NEXT:    @ implicit-def: $q2
-; CHECK-BE-NEXT:    vmov.32 q0[3], lr
+; CHECK-BE-NEXT:    vmov q0[3], q0[1], r12, lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -334,6 +323,7 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
+; CHECK-BE-NEXT:    @ implicit-def: $q2
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB6_2
@@ -355,17 +345,15 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vmov r2, s11
 ; CHECK-BE-NEXT:    movs r4, #0
-; CHECK-BE-NEXT:    vmov r3, s1
 ; CHECK-BE-NEXT:    vmov r1, s3
+; CHECK-BE-NEXT:    vmov r3, s1
 ; CHECK-BE-NEXT:    rsbs r5, r2, #0
 ; CHECK-BE-NEXT:    sbcs.w r2, r4, r2, asr #31
 ; CHECK-BE-NEXT:    vmov r2, s9
-; CHECK-BE-NEXT:    asr.w lr, r3, #31
-; CHECK-BE-NEXT:    vmov.32 q1[0], lr
 ; CHECK-BE-NEXT:    asr.w r12, r1, #31
-; CHECK-BE-NEXT:    vmov.32 q1[1], r3
-; CHECK-BE-NEXT:    vmov.32 q1[2], r12
-; CHECK-BE-NEXT:    vmov.32 q1[3], r1
+; CHECK-BE-NEXT:    asr.w lr, r3, #31
+; CHECK-BE-NEXT:    vmov q1[2], q1[0], lr, r12
+; CHECK-BE-NEXT:    vmov q1[3], q1[1], r3, r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
@@ -401,64 +389,63 @@ entry:
 define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
 ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r7, lr}
-; CHECK-LE-NEXT:    push {r7, lr}
+; CHECK-LE-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q1
+; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q0
 ; CHECK-LE-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
-; CHECK-LE-NEXT:    vmov.32 q0[0], lr
-; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
-; CHECK-LE-NEXT:    mov.w lr, #0
-; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt.w lr, #1
-; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
+; CHECK-LE-NEXT:    vmov q1[2], q1[0], lr, r5
+; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
+; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    cmp r1, #0
+; CHECK-LE-NEXT:    rsbs r4, r5, #0
+; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    it lt
+; CHECK-LE-NEXT:    movlt r3, #1
+; CHECK-LE-NEXT:    cmp r3, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r1, #1
-; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
-; CHECK-LE-NEXT:    vmov.32 q0[2], r12
-; CHECK-LE-NEXT:    and r3, r1, #3
-; CHECK-LE-NEXT:    mov.w r12, #0
-; CHECK-LE-NEXT:    lsls r1, r1, #31
+; CHECK-LE-NEXT:    mvnne r3, #1
+; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-LE-NEXT:    and r12, r3, #3
+; CHECK-LE-NEXT:    lsls r1, r3, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
-; CHECK-LE-NEXT:    lsls r1, r3, #30
+; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
+; CHECK-LE-NEXT:    lsls.w r1, r12, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
-; CHECK-LE-NEXT:    vmov r1, s0
+; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
+; CHECK-LE-NEXT:    vmov r1, s4
 ; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vand q1, q1, q2
+; CHECK-LE-NEXT:    vand q0, q0, q2
 ; CHECK-LE-NEXT:    rsbs r3, r1, #0
-; CHECK-LE-NEXT:    vmov r3, s2
+; CHECK-LE-NEXT:    vmov r3, s6
 ; CHECK-LE-NEXT:    sbcs.w r1, r2, r1, asr #31
+; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt.w r12, #1
-; CHECK-LE-NEXT:    rsbs r1, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r1, r2, r3, asr #31
+; CHECK-LE-NEXT:    movlt r1, #1
+; CHECK-LE-NEXT:    rsbs r5, r3, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r2, #1
 ; CHECK-LE-NEXT:    cmp r2, #0
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r12, #0, #1
+; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
 ; CHECK-LE-NEXT:    and r1, r2, #3
 ; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne d2, [r0]
+; CHECK-LE-NEXT:    vstrne d0, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi d3, [r0, #8]
+; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    pop {r7, pc}
+; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32:
 ; CHECK-BE:       @ %bb.0: @ %entry
@@ -470,9 +457,7 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov.32 q0[1], r12
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmov.32 q0[3], lr
+; CHECK-BE-NEXT:    vmov q0[3], q0[1], r12, lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -484,6 +469,7 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
+; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB7_2
@@ -541,66 +527,65 @@ entry:
 define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
 ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32_unaligned:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r7, lr}
-; CHECK-LE-NEXT:    push {r7, lr}
+; CHECK-LE-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
-; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q1
+; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
+; CHECK-LE-NEXT:    movs r3, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q0
 ; CHECK-LE-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
-; CHECK-LE-NEXT:    vmov.32 q0[0], lr
-; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
-; CHECK-LE-NEXT:    mov.w lr, #0
-; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt.w lr, #1
-; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
+; CHECK-LE-NEXT:    vmov q1[2], q1[0], lr, r5
+; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
+; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    cmp r1, #0
+; CHECK-LE-NEXT:    rsbs r4, r5, #0
+; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    it lt
+; CHECK-LE-NEXT:    movlt r3, #1
+; CHECK-LE-NEXT:    cmp r3, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r1, #1
-; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
-; CHECK-LE-NEXT:    vmov.32 q0[2], r12
-; CHECK-LE-NEXT:    and r3, r1, #3
-; CHECK-LE-NEXT:    mov.w r12, #0
-; CHECK-LE-NEXT:    lsls r1, r1, #31
+; CHECK-LE-NEXT:    mvnne r3, #1
+; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
+; CHECK-LE-NEXT:    and r12, r3, #3
+; CHECK-LE-NEXT:    lsls r1, r3, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
-; CHECK-LE-NEXT:    lsls r1, r3, #30
+; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
+; CHECK-LE-NEXT:    lsls.w r1, r12, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
-; CHECK-LE-NEXT:    vmov r1, s0
+; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
+; CHECK-LE-NEXT:    vmov r1, s4
 ; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vand q1, q1, q2
+; CHECK-LE-NEXT:    vand q0, q0, q2
 ; CHECK-LE-NEXT:    rsbs r3, r1, #0
-; CHECK-LE-NEXT:    vmov r3, s2
+; CHECK-LE-NEXT:    vmov r3, s6
 ; CHECK-LE-NEXT:    sbcs.w r1, r2, r1, asr #31
+; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt.w r12, #1
-; CHECK-LE-NEXT:    rsbs r1, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r1, r2, r3, asr #31
+; CHECK-LE-NEXT:    movlt r1, #1
+; CHECK-LE-NEXT:    rsbs r5, r3, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r2, #1
 ; CHECK-LE-NEXT:    cmp r2, #0
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r12, #0, #1
+; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
 ; CHECK-LE-NEXT:    and r1, r2, #3
 ; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, r3, d2
+; CHECK-LE-NEXT:    vmovne r2, r3, d0
 ; CHECK-LE-NEXT:    strdne r2, r3, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, r2, d3
+; CHECK-LE-NEXT:    vmovmi r1, r2, d1
 ; CHECK-LE-NEXT:    strdmi r1, r2, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    pop {r7, pc}
+; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned:
 ; CHECK-BE:       @ %bb.0: @ %entry
@@ -612,9 +597,7 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov.32 q0[1], r12
-; CHECK-BE-NEXT:    @ implicit-def: $q1
-; CHECK-BE-NEXT:    vmov.32 q0[3], lr
+; CHECK-BE-NEXT:    vmov q0[3], q0[1], r12, lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -626,6 +609,7 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
+; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB8_2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index c533216127bb..75da80625c81 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -62,10 +62,8 @@ define arm_aapcs_vfpcc <2 x i64> @smin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -136,10 +134,8 @@ define arm_aapcs_vfpcc <2 x i64> @umin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -211,10 +207,8 @@ define arm_aapcs_vfpcc <2 x i64> @smax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -285,10 +279,8 @@ define arm_aapcs_vfpcc <2 x i64> @umax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -394,10 +386,8 @@ define arm_aapcs_vfpcc <2 x double> @maxnm_float64_t(<2 x double> %src1, <2 x do
 ; CHECK-NEXT:    movne r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.32 q0[2], r4
-; CHECK-NEXT:    vmov.32 q0[3], r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r4
 ; CHECK-NEXT:    vbic q1, q5, q0
 ; CHECK-NEXT:    vand q0, q4, q0
 ; CHECK-NEXT:    vorr q0, q0, q1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-neg.ll b/llvm/test/CodeGen/Thumb2/mve-neg.ll
index 2d8d0f4ac519..ea4ef0921a97 100644
--- a/llvm/test/CodeGen/Thumb2/mve-neg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-neg.ll
@@ -43,10 +43,8 @@ define arm_aapcs_vfpcc <2 x i64> @neg_v2i64(<2 x i64> %s1) {
 ; CHECK-NEXT:    sbc.w r0, r12, r0
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sbc.w r3, r12, r3
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sub nsw <2 x i64> zeroinitializer, %s1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index 0fe26fbc4753..252f9c6439ec 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -149,44 +149,38 @@ define dso_local i32 @e() #0 {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    .pad #392
 ; CHECK-NEXT:    sub sp, #392
 ; CHECK-NEXT:    movw r9, :lower16:.L_MergedGlobals
 ; CHECK-NEXT:    vldr s0, .LCPI1_0
 ; CHECK-NEXT:    movt r9, :upper16:.L_MergedGlobals
 ; CHECK-NEXT:    vldr s3, .LCPI1_1
-; CHECK-NEXT:    mov r5, r9
 ; CHECK-NEXT:    mov r7, r9
-; CHECK-NEXT:    ldr r1, [r5, #8]!
-; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    mov r5, r9
 ; CHECK-NEXT:    ldr r0, [r7, #4]!
 ; CHECK-NEXT:    movw r4, :lower16:e
-; CHECK-NEXT:    vmov.32 q4[0], r5
+; CHECK-NEXT:    ldr r1, [r5, #8]!
 ; CHECK-NEXT:    movt r4, :upper16:e
-; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    vdup.32 q4, r7
 ; CHECK-NEXT:    vmov s1, r7
-; CHECK-NEXT:    vmov.32 q1[1], r6
-; CHECK-NEXT:    vmov.32 q5[0], r7
-; CHECK-NEXT:    vmov.32 q1[2], r5
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r5
 ; CHECK-NEXT:    vmov s9, r4
-; CHECK-NEXT:    vmov.32 q1[3], r4
-; CHECK-NEXT:    vdup.32 q6, r7
-; CHECK-NEXT:    vstrw.32 q1, [sp, #76]
-; CHECK-NEXT:    vmov q1, q5
-; CHECK-NEXT:    vmov.32 q1[1], r7
+; CHECK-NEXT:    vmov q1[3], q1[1], r6, r4
 ; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vmov.f32 s8, s0
-; CHECK-NEXT:    vmov.32 q1[2], r6
-; CHECK-NEXT:    vmov q3, q6
-; CHECK-NEXT:    vmov q7, q6
+; CHECK-NEXT:    vmov q5, q4
 ; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vstrw.32 q1, [sp, #76]
+; CHECK-NEXT:    vmov q1[2], q1[0], r7, r6
 ; CHECK-NEXT:    mov.w r8, #4
 ; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov q1[3], q1[1], r7, r4
 ; CHECK-NEXT:    vmov.32 q3[0], r4
-; CHECK-NEXT:    vmov.32 q7[1], r4
+; CHECK-NEXT:    vmov.32 q5[1], r4
 ; CHECK-NEXT:    str r1, [r0]
 ; CHECK-NEXT:    vmov.f32 s11, s3
 ; CHECK-NEXT:    movs r1, #64
@@ -195,21 +189,19 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #44]
 ; CHECK-NEXT:    str r0, [r0]
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
-; CHECK-NEXT:    vstrw.32 q7, [r0]
+; CHECK-NEXT:    vstrw.32 q5, [r0]
 ; CHECK-NEXT:    vstrw.32 q3, [r0]
 ; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    bl __aeabi_memclr4
-; CHECK-NEXT:    vmov.32 q5[1], r5
-; CHECK-NEXT:    vmov.32 q4[1], r4
-; CHECK-NEXT:    vmov.32 q5[2], r7
-; CHECK-NEXT:    vmov.32 q4[2], r7
-; CHECK-NEXT:    vmov.32 q5[3], r6
-; CHECK-NEXT:    vmov.32 q6[0], r10
-; CHECK-NEXT:    vmov.32 q4[3], r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r7
+; CHECK-NEXT:    vmov q1[2], q1[0], r7, r7
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r6
+; CHECK-NEXT:    vmov.32 q4[0], r10
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    str.w r10, [r9]
 ; CHECK-NEXT:    vstrw.32 q4, [r0]
-; CHECK-NEXT:    vstrw.32 q6, [r0]
-; CHECK-NEXT:    vstrw.32 q5, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    str.w r8, [sp, #308]
 ; CHECK-NEXT:  .LBB1_1: @ %for.cond
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
index 9848a56b9f33..17d83c8e0988 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
@@ -575,24 +575,22 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q2, q0, q1
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s9
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q1, q1, q3
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vbic q1, q1, q2
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -606,47 +604,43 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, s9
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -662,42 +656,38 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqr_v2i1(<2 x i64> %a, <2 x i64> %b, i64 %c) {
 ; CHECK-LABEL: cmpeqr_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    eors r0, r2
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    eors r1, r3
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    eors r0, r3
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r2
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
index c7e553fa3510..5dfd5a7074e5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
@@ -12,15 +12,13 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
 ; CHECK-LE-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-LE-NEXT:    vmsr p0, r0
 ; CHECK-LE-NEXT:    vpsel q1, q2, q1
-; CHECK-LE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-LE-NEXT:    vmov.32 q2[0], r0
-; CHECK-LE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-LE-NEXT:    vmov.32 q2[1], r0
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-LE-NEXT:    vmov.32 q2[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-LE-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-LE-NEXT:    vmov.32 q2[3], r0
+; CHECK-LE-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    add sp, #4
@@ -35,17 +33,15 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q2, q1
-; CHECK-BE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-BE-NEXT:    vmov.32 q2[0], r0
-; CHECK-BE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-BE-NEXT:    vmov.32 q2[1], r0
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-BE-NEXT:    vmov.32 q2[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-BE-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-BE-NEXT:    vmov.32 q2[3], r0
+; CHECK-BE-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
 ; CHECK-BE-NEXT:    add sp, #4
@@ -179,13 +175,11 @@ define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) {
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    and r1, r0, #2
 ; CHECK-LE-NEXT:    and r0, r0, #1
-; CHECK-LE-NEXT:    rsbs r0, r0, #0
 ; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmov.32 q1[0], r0
+; CHECK-LE-NEXT:    rsbs r0, r0, #0
 ; CHECK-LE-NEXT:    sub.w r1, r2, r1, lsr #1
-; CHECK-LE-NEXT:    vmov.32 q1[1], r0
-; CHECK-LE-NEXT:    vmov.32 q1[2], r1
-; CHECK-LE-NEXT:    vmov.32 q1[3], r1
+; CHECK-LE-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-LE-NEXT:    vmov q1[3], q1[1], r0, r1
 ; CHECK-LE-NEXT:    vand q0, q0, q1
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    bx lr
@@ -195,14 +189,12 @@ define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) {
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    and r1, r0, #2
-; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    and r0, r0, #1
-; CHECK-BE-NEXT:    sub.w r1, r2, r1, lsr #1
+; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    rsbs r0, r0, #0
-; CHECK-BE-NEXT:    vmov.32 q1[0], r1
-; CHECK-BE-NEXT:    vmov.32 q1[1], r1
-; CHECK-BE-NEXT:    vmov.32 q1[2], r0
-; CHECK-BE-NEXT:    vmov.32 q1[3], r0
+; CHECK-BE-NEXT:    sub.w r1, r2, r1, lsr #1
+; CHECK-BE-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-BE-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vand q0, q0, q2
 ; CHECK-BE-NEXT:    add sp, #4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index b88576a22cc2..c280fa2ed658 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -64,10 +64,8 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp sgt <2 x i64> %src, zeroinitializer
@@ -141,8 +139,7 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
@@ -201,16 +198,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @trunc_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-LABEL: trunc_v2i1_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    and r1, r1, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    and r0, r0, #1
-; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    and r1, r1, #1
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
index df6a38f2f981..a68250596711 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
@@ -10,15 +10,13 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) {
 ; CHECK-LE-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-LE-NEXT:    vmsr p0, r0
 ; CHECK-LE-NEXT:    vpsel q1, q2, q1
-; CHECK-LE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-LE-NEXT:    vmov.32 q2[0], r0
-; CHECK-LE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-LE-NEXT:    vmov.32 q2[1], r0
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-LE-NEXT:    vmov.32 q2[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-LE-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-LE-NEXT:    vmov.32 q2[3], r0
+; CHECK-LE-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
+; CHECK-LE-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
@@ -30,17 +28,15 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q2, q1
-; CHECK-BE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-BE-NEXT:    vmov.32 q2[0], r0
-; CHECK-BE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-BE-NEXT:    vmov.32 q2[1], r0
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-BE-NEXT:    vmov.32 q2[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-BE-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-BE-NEXT:    vmov.32 q2[3], r0
+; CHECK-BE-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
 ; CHECK-BE-NEXT:    bx lr
@@ -144,28 +140,24 @@ define arm_aapcs_vfpcc <2 x i64> @load_v2i1(<2 x i1> *%src, <2 x i64> %a) {
 ; CHECK-LE-LABEL: load_v2i1:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    ldrb r0, [r0]
-; CHECK-LE-NEXT:    and r1, r0, #1
+; CHECK-LE-NEXT:    ubfx r1, r0, #1, #1
+; CHECK-LE-NEXT:    and r0, r0, #1
 ; CHECK-LE-NEXT:    rsbs r1, r1, #0
-; CHECK-LE-NEXT:    ubfx r0, r0, #1, #1
-; CHECK-LE-NEXT:    vmov.32 q1[0], r1
 ; CHECK-LE-NEXT:    rsbs r0, r0, #0
-; CHECK-LE-NEXT:    vmov.32 q1[1], r1
-; CHECK-LE-NEXT:    vmov.32 q1[2], r0
-; CHECK-LE-NEXT:    vmov.32 q1[3], r0
+; CHECK-LE-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-LE-NEXT:    vmov q1[3], q1[1], r0, r1
 ; CHECK-LE-NEXT:    vand q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: load_v2i1:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    ldrb r0, [r0]
-; CHECK-BE-NEXT:    ubfx r1, r0, #1, #1
-; CHECK-BE-NEXT:    and r0, r0, #1
+; CHECK-BE-NEXT:    and r1, r0, #1
+; CHECK-BE-NEXT:    ubfx r0, r0, #1, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
 ; CHECK-BE-NEXT:    rsbs r0, r0, #0
-; CHECK-BE-NEXT:    vmov.32 q1[0], r1
-; CHECK-BE-NEXT:    vmov.32 q1[1], r1
-; CHECK-BE-NEXT:    vmov.32 q1[2], r0
-; CHECK-BE-NEXT:    vmov.32 q1[3], r0
+; CHECK-BE-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-BE-NEXT:    vmov q1[3], q1[1], r0, r1
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vand q0, q0, q2
 ; CHECK-BE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
index 35fb1bfd6977..9ed3add2fd16 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
@@ -323,22 +323,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vbic q0, q0, q2
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vorr q0, q1, q0
@@ -353,22 +351,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vbic q0, q0, q2
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vorr q0, q1, q0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
index fbc268fa9300..d498d7cb6f53 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
@@ -377,38 +377,34 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vorr q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
@@ -425,47 +421,43 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s9
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vorr q2, q2, q3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vorr q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
index f6d3bafc1f01..b04be5e25f02 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
@@ -315,14 +315,12 @@ define <4 x i32> @shuffle5_b_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    vcmp.i16 eq, q0, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    mov r0, sp
@@ -348,14 +346,12 @@ define <4 x i32> @shuffle5_t_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    vcmp.i16 eq, q0, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    mov r0, sp

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
index f92a4bd958f9..6c13200a2d55 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
@@ -457,38 +457,34 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: cmpeqz_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    veor q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
@@ -505,47 +501,43 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: cmpeq_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s9
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    veor q2, q2, q3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    veor q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 4579c2714b97..95a0c9458c8e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -20,7 +20,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    mov r10, r1
 ; CHECK-NEXT:    mov r11, r2
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_3: @ %vector.ph
@@ -32,69 +32,65 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
 ; CHECK-NEXT:    add.w r11, r2, r3, lsl #2
-; CHECK-NEXT:    add.w r6, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r10, r1, r3, lsl #2
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
-; CHECK-NEXT:    mvn r10, #-2147483648
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrd r4, r8, [r0]
+; CHECK-NEXT:    ldrd r5, r4, [r0]
+; CHECK-NEXT:    mov.w r3, #-1
+; CHECK-NEXT:    ldrd r8, r7, [r1]
 ; CHECK-NEXT:    adds r0, #8
-; CHECK-NEXT:    ldrd r7, r5, [r1]
-; CHECK-NEXT:    adds r1, #8
-; CHECK-NEXT:    smull r8, r5, r5, r8
 ; CHECK-NEXT:    smull r4, r7, r7, r4
-; CHECK-NEXT:    asrl r8, r5, #31
+; CHECK-NEXT:    adds r1, #8
 ; CHECK-NEXT:    asrl r4, r7, #31
+; CHECK-NEXT:    smull r6, r5, r8, r5
 ; CHECK-NEXT:    rsbs.w r9, r4, #-2147483648
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    mov.w r9, #-1
-; CHECK-NEXT:    sbcs.w r3, r9, r7
+; CHECK-NEXT:    sbcs r3, r7
 ; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    vmov.32 q4[1], r7
+; CHECK-NEXT:    asrl r6, r5, #31
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q4[2], r8
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov.32 q4[3], r5
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    rsbs.w r3, r8, #-2147483648
-; CHECK-NEXT:    sbcs.w r3, r9, r5
+; CHECK-NEXT:    csetm r9, ne
+; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
+; CHECK-NEXT:    mov.w r3, #-1
+; CHECK-NEXT:    vmov q4[2], q4[0], r6, r4
+; CHECK-NEXT:    sbcs r3, r5
+; CHECK-NEXT:    vmov q4[3], q4[1], r5, r7
 ; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    mvn r6, #-2147483648
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q2[2], r3
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r9
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r9
 ; CHECK-NEXT:    vbic q3, q0, q2
 ; CHECK-NEXT:    vand q2, q4, q2
 ; CHECK-NEXT:    vorr q2, q2, q3
 ; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    subs.w r4, r4, r10
+; CHECK-NEXT:    vmov r5, s10
+; CHECK-NEXT:    subs r4, r4, r6
+; CHECK-NEXT:    vmov r4, s11
 ; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    subs r5, r5, r6
 ; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    subs.w r4, r4, r10
-; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csetm r4, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r4
 ; CHECK-NEXT:    vbic q4, q1, q3
 ; CHECK-NEXT:    vand q2, q2, q3
 ; CHECK-NEXT:    vorr q2, q2, q4
@@ -116,7 +112,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r3, [r12], #4
-; CHECK-NEXT:    ldr r4, [r6], #4
+; CHECK-NEXT:    ldr r4, [r10], #4
 ; CHECK-NEXT:    smull r4, r3, r4, r3
 ; CHECK-NEXT:    asrl r4, r3, #31
 ; CHECK-NEXT:    subs r5, r1, r4
@@ -229,149 +225,141 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB1_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    mov r9, r1
 ; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB1_3
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r1, r9
+; CHECK-NEXT:    mov r9, r1
 ; CHECK-NEXT:    mov r11, r2
 ; CHECK-NEXT:    b .LBB1_6
 ; CHECK-NEXT:  .LBB1_3: @ %vector.ph
-; CHECK-NEXT:    bic r7, r3, #3
-; CHECK-NEXT:    adr r4, .LCPI1_0
-; CHECK-NEXT:    subs r1, r7, #4
 ; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    bic r3, r3, #3
+; CHECK-NEXT:    subs r7, r3, #4
+; CHECK-NEXT:    adr r4, .LCPI1_0
+; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
-; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
-; CHECK-NEXT:    str r7, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adr r4, .LCPI1_1
-; CHECK-NEXT:    add.w r11, r2, r7, lsl #2
-; CHECK-NEXT:    add.w r1, r9, r7, lsl #2
-; CHECK-NEXT:    add.w r12, r0, r7, lsl #2
+; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT:    add.w r11, r2, r3, lsl #2
+; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
-; CHECK-NEXT:    mov.w r10, #-1
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
-; CHECK-NEXT:    vldrw.u32 q3, [r9], #16
+; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    mov.w r2, #-1
 ; CHECK-NEXT:    vmov.f32 s16, s10
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    vmov.f32 s20, s14
 ; CHECK-NEXT:    vmov.f32 s18, s11
 ; CHECK-NEXT:    vmov.f32 s22, s15
 ; CHECK-NEXT:    vmullb.s32 q6, q5, q4
-; CHECK-NEXT:    vmov.f32 s10, s9
-; CHECK-NEXT:    vmov r7, s25
-; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov.f32 s14, s13
+; CHECK-NEXT:    vmov r7, s27
+; CHECK-NEXT:    vmov r4, s26
 ; CHECK-NEXT:    asrl r4, r7, #31
-; CHECK-NEXT:    vmov r8, s26
+; CHECK-NEXT:    vmov r10, s24
 ; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
-; CHECK-NEXT:    vmov.f32 s14, s13
-; CHECK-NEXT:    sbcs.w r5, r10, r7
+; CHECK-NEXT:    vmov.f32 s10, s9
+; CHECK-NEXT:    sbcs.w r5, r2, r7
+; CHECK-NEXT:    vmov r6, s12
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q4[0], r5
-; CHECK-NEXT:    vmov.32 q4[1], r5
-; CHECK-NEXT:    vmov r5, s27
-; CHECK-NEXT:    asrl r8, r5, #31
-; CHECK-NEXT:    vmov.32 q6[0], r4
-; CHECK-NEXT:    rsbs.w r6, r8, #-2147483648
-; CHECK-NEXT:    vmov.32 q6[1], r7
-; CHECK-NEXT:    sbcs.w r6, r10, r5
-; CHECK-NEXT:    vmov.32 q6[2], r8
-; CHECK-NEXT:    mov.w r6, #0
-; CHECK-NEXT:    vmov.32 q6[3], r5
+; CHECK-NEXT:    vmov r5, s25
+; CHECK-NEXT:    csetm r8, ne
+; CHECK-NEXT:    asrl r10, r5, #31
+; CHECK-NEXT:    rsbs.w r3, r10, #-2147483648
+; CHECK-NEXT:    vmov q6[2], q6[0], r10, r4
+; CHECK-NEXT:    sbcs.w r3, r2, r5
+; CHECK-NEXT:    vmov q6[3], q6[1], r5, r7
+; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #1
-; CHECK-NEXT:    cmp r6, #0
-; CHECK-NEXT:    csetm r6, ne
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r8
+; CHECK-NEXT:    vmov q4[3], q4[1], r3, r8
 ; CHECK-NEXT:    mvn r8, #-2147483648
-; CHECK-NEXT:    vmov.32 q4[2], r6
-; CHECK-NEXT:    vmov.32 q4[3], r6
-; CHECK-NEXT:    vmov r6, s14
 ; CHECK-NEXT:    vbic q5, q0, q4
 ; CHECK-NEXT:    vand q4, q6, q4
 ; CHECK-NEXT:    vorr q4, q4, q5
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov r4, s17
-; CHECK-NEXT:    subs.w r5, r5, r8
-; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    mov.w r4, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov.32 q5[0], r4
-; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    subs.w r4, r4, r8
 ; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csetm r3, ne
 ; CHECK-NEXT:    subs.w r5, r5, r8
-; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r5, s8
 ; CHECK-NEXT:    sbcs r4, r4, #0
 ; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov.32 q5[2], r4
-; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov q5[2], q5[0], r3, r4
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    smull r6, r5, r6, r5
 ; CHECK-NEXT:    vbic q6, q1, q5
 ; CHECK-NEXT:    vand q4, q4, q5
 ; CHECK-NEXT:    vorr q4, q4, q6
-; CHECK-NEXT:    smull r4, r7, r5, r4
-; CHECK-NEXT:    asrl r4, r7, #31
-; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
-; CHECK-NEXT:    vmov.32 q3[0], r4
-; CHECK-NEXT:    sbcs.w r5, r10, r7
-; CHECK-NEXT:    vmov.32 q3[1], r7
-; CHECK-NEXT:    mov.w r5, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #1
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    vmov.32 q5[1], r5
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    smull r6, r5, r6, r5
 ; CHECK-NEXT:    asrl r6, r5, #31
-; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
-; CHECK-NEXT:    vmov.32 q3[2], r6
-; CHECK-NEXT:    sbcs.w r3, r10, r5
-; CHECK-NEXT:    vmov.32 q3[3], r5
+; CHECK-NEXT:    smull r4, r7, r4, r3
+; CHECK-NEXT:    asrl r4, r7, #31
+; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
+; CHECK-NEXT:    vmov q5[2], q5[0], r6, r4
+; CHECK-NEXT:    sbcs.w r3, r2, r7
+; CHECK-NEXT:    vmov q5[3], q5[1], r5, r7
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q5[2], r3
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vbic q2, q0, q5
-; CHECK-NEXT:    vand q3, q3, q5
-; CHECK-NEXT:    vorr q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    rsbs.w r1, r6, #-2147483648
+; CHECK-NEXT:    sbcs.w r1, r2, r5
+; CHECK-NEXT:    mov.w r1, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r1, #1
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    ldrd r1, r2, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT:    vbic q3, q0, q2
+; CHECK-NEXT:    vand q2, q5, q2
+; CHECK-NEXT:    vorr q2, q2, q3
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    subs.w r3, r3, r8
+; CHECK-NEXT:    sbcs r3, r4, #0
 ; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    vmov.32 q3[1], r5
 ; CHECK-NEXT:    subs.w r4, r4, r8
 ; CHECK-NEXT:    sbcs r3, r3, #0
 ; CHECK-NEXT:    mov.w r3, #0
@@ -379,7 +367,7 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r3
 ; CHECK-NEXT:    vbic q5, q1, q3
 ; CHECK-NEXT:    vand q2, q2, q3
 ; CHECK-NEXT:    vorr q2, q2, q5
@@ -400,25 +388,25 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    mvn r2, #-2147483648
 ; CHECK-NEXT:  .LBB1_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r4, [r12], #4
-; CHECK-NEXT:    ldr r5, [r1], #4
-; CHECK-NEXT:    smull r4, r5, r5, r4
-; CHECK-NEXT:    asrl r4, r5, #31
-; CHECK-NEXT:    subs r6, r3, r4
-; CHECK-NEXT:    sbcs.w r6, r0, r5
-; CHECK-NEXT:    mov.w r6, #0
+; CHECK-NEXT:    ldr r1, [r12], #4
+; CHECK-NEXT:    ldr r4, [r9], #4
+; CHECK-NEXT:    smull r4, r1, r4, r1
+; CHECK-NEXT:    asrl r4, r1, #31
+; CHECK-NEXT:    subs r5, r3, r4
+; CHECK-NEXT:    sbcs.w r5, r0, r1
+; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #1
-; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    movlt r5, #1
+; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csel r4, r4, r3, ne
-; CHECK-NEXT:    csel r5, r5, r0, ne
-; CHECK-NEXT:    subs r6, r4, r2
-; CHECK-NEXT:    sbcs r5, r5, #0
-; CHECK-NEXT:    csel r4, r4, r2, lt
-; CHECK-NEXT:    str r4, [r11], #4
+; CHECK-NEXT:    csel r1, r1, r0, ne
+; CHECK-NEXT:    subs r5, r4, r2
+; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    csel r1, r4, r2, lt
+; CHECK-NEXT:    str r1, [r11], #4
 ; CHECK-NEXT:    le lr, .LBB1_7
 ; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -515,10 +503,8 @@ for.body:                                         ; preds = %for.body.preheader2
 define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: ssatmul_4t_q31:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
@@ -527,19 +513,19 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    beq.w .LBB2_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r7, r3, #3
-; CHECK-NEXT:    adr r4, .LCPI2_1
-; CHECK-NEXT:    bic r7, r7, #3
 ; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    bic r7, r7, #3
+; CHECK-NEXT:    adr r4, .LCPI2_1
 ; CHECK-NEXT:    subs r7, #4
+; CHECK-NEXT:    adr r5, .LCPI2_2
 ; CHECK-NEXT:    vldrw.u32 q2, [r4]
-; CHECK-NEXT:    adr r4, .LCPI2_2
-; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    vldrw.u32 q3, [r5]
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
 ; CHECK-NEXT:    adr r6, .LCPI2_0
 ; CHECK-NEXT:    subs r7, r3, #1
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    vldrw.u32 q3, [r4]
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    vdup.32 q1, r7
 ; CHECK-NEXT:    mov.w r12, #-1
 ; CHECK-NEXT:    mvn r8, #-2147483648
@@ -560,66 +546,62 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov.f32 s30, s23
 ; CHECK-NEXT:    vmullb.s32 q0, q7, q6
 ; CHECK-NEXT:    vmov.f32 s18, s17
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    vmov r6, s2
 ; CHECK-NEXT:    asrl r6, r5, #31
-; CHECK-NEXT:    vmov r7, s3
-; CHECK-NEXT:    rsbs.w r4, r6, #-2147483648
-; CHECK-NEXT:    vmov.32 q7[0], r6
-; CHECK-NEXT:    sbcs.w r4, r12, r5
-; CHECK-NEXT:    vmov.32 q7[1], r5
-; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
 ; CHECK-NEXT:    vmov.f32 s22, s21
+; CHECK-NEXT:    sbcs.w r7, r12, r5
+; CHECK-NEXT:    mov.w r7, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov.32 q6[0], r4
-; CHECK-NEXT:    vmov.32 q6[1], r4
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    movlt r7, #1
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    vmov r7, s1
+; CHECK-NEXT:    csetm r10, ne
 ; CHECK-NEXT:    asrl r4, r7, #31
 ; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
-; CHECK-NEXT:    vmov.32 q7[2], r4
+; CHECK-NEXT:    vmov q7[2], q7[0], r4, r6
 ; CHECK-NEXT:    sbcs.w r3, r12, r7
-; CHECK-NEXT:    vmov.32 q7[3], r7
+; CHECK-NEXT:    vmov q7[3], q7[1], r7, r5
 ; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    vmov r7, s20
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q6[2], r3
-; CHECK-NEXT:    vmov.32 q6[3], r3
-; CHECK-NEXT:    vbic q0, q2, q6
-; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vorr q6, q6, q0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r10
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r10
+; CHECK-NEXT:    vbic q6, q2, q0
+; CHECK-NEXT:    vand q0, q7, q0
+; CHECK-NEXT:    vorr q6, q0, q6
 ; CHECK-NEXT:    vmov r4, s24
 ; CHECK-NEXT:    vmov r3, s25
+; CHECK-NEXT:    vmov r5, s26
 ; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    vmov r4, s27
 ; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    vmov r4, s26
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    subs.w r5, r5, r8
 ; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csetm r4, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r4
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r4, s22
 ; CHECK-NEXT:    vbic q7, q3, q0
 ; CHECK-NEXT:    vand q0, q6, q0
 ; CHECK-NEXT:    vorr q6, q0, q7
 ; CHECK-NEXT:    smull r6, r5, r4, r3
-; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    asrl r6, r5, #31
 ; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
 ; CHECK-NEXT:    sbcs.w r3, r12, r5
@@ -627,49 +609,43 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q7[0], r3
-; CHECK-NEXT:    vmov.32 q7[1], r3
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov.32 q4[0], r6
-; CHECK-NEXT:    vmov.32 q4[1], r5
-; CHECK-NEXT:    smull r4, r7, r4, r3
+; CHECK-NEXT:    csetm r10, ne
+; CHECK-NEXT:    smull r4, r7, r7, r4
 ; CHECK-NEXT:    asrl r4, r7, #31
 ; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
-; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r6
 ; CHECK-NEXT:    sbcs.w r3, r12, r7
-; CHECK-NEXT:    vmov.32 q4[3], r7
+; CHECK-NEXT:    vmov q5[3], q5[1], r7, r5
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q7[2], r3
-; CHECK-NEXT:    vmov.32 q7[3], r3
-; CHECK-NEXT:    vbic q0, q2, q7
-; CHECK-NEXT:    vand q4, q4, q7
-; CHECK-NEXT:    vorr q4, q4, q0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r10
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r10
+; CHECK-NEXT:    vbic q4, q2, q0
+; CHECK-NEXT:    vand q0, q5, q0
+; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    vmov r4, s19
 ; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    vmov r4, s18
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    subs.w r5, r5, r8
 ; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csetm r4, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r4
 ; CHECK-NEXT:    vbic q5, q3, q0
 ; CHECK-NEXT:    vand q0, q4, q0
 ; CHECK-NEXT:    vorr q0, q0, q5
@@ -682,8 +658,7 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI2_0:
@@ -775,33 +750,31 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrd r4, r9, [r0]
+; CHECK-NEXT:    ldrd r4, r7, [r0]
 ; CHECK-NEXT:    adds r0, #8
 ; CHECK-NEXT:    ldrd r5, r10, [r1]
 ; CHECK-NEXT:    adds r1, #8
 ; CHECK-NEXT:    umull r4, r5, r5, r4
 ; CHECK-NEXT:    lsrl r4, r5, #31
 ; CHECK-NEXT:    subs.w r6, r4, #-1
-; CHECK-NEXT:    vmov.32 q1[0], r4
+; CHECK-NEXT:    umull r6, r7, r10, r7
 ; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q0[0], r5
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    umull r6, r5, r10, r9
-; CHECK-NEXT:    lsrl r6, r5, #31
-; CHECK-NEXT:    subs.w r7, r6, #-1
-; CHECK-NEXT:    vmov.32 q1[2], r6
-; CHECK-NEXT:    sbcs r5, r5, #0
+; CHECK-NEXT:    lsrl r6, r7, #31
+; CHECK-NEXT:    csetm r9, ne
+; CHECK-NEXT:    subs.w r5, r6, #-1
+; CHECK-NEXT:    vmov.32 q0[1], r9
+; CHECK-NEXT:    sbcs r5, r7, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r6
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q0[2], r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r9, r5
 ; CHECK-NEXT:    vand q1, q1, q0
 ; CHECK-NEXT:    vorn q0, q1, q0
 ; CHECK-NEXT:    vmov r4, s2
@@ -906,8 +879,10 @@ for.body:                                         ; preds = %for.body.preheader,
 define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: usatmul_4_q31:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    cmp r3, #0
@@ -943,57 +918,53 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov r5, s17
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    lsrl r4, r5, #31
-; CHECK-NEXT:    vmov.f32 s10, s9
+; CHECK-NEXT:    vmov r7, s19
 ; CHECK-NEXT:    subs.w r6, r4, #-1
-; CHECK-NEXT:    vmov.32 q3[0], r4
+; CHECK-NEXT:    vmov.f32 s10, s9
 ; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    vmov r6, s18
 ; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    lsrl r6, r7, #31
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q1[0], r5
-; CHECK-NEXT:    vmov.32 q1[1], r5
-; CHECK-NEXT:    vmov r5, s19
-; CHECK-NEXT:    lsrl r6, r5, #31
-; CHECK-NEXT:    subs.w r7, r6, #-1
-; CHECK-NEXT:    vmov.32 q3[2], r6
-; CHECK-NEXT:    sbcs r5, r5, #0
+; CHECK-NEXT:    csetm r11, ne
+; CHECK-NEXT:    subs.w r5, r6, #-1
+; CHECK-NEXT:    sbcs r5, r7, #0
+; CHECK-NEXT:    vmov.32 q1[1], r11
 ; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r6
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q1[2], r5
+; CHECK-NEXT:    vmov q1[2], q1[0], r11, r5
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vorn q1, q3, q1
 ; CHECK-NEXT:    vmullb.u32 q3, q2, q0
 ; CHECK-NEXT:    vmov r5, s13
 ; CHECK-NEXT:    vmov r4, s12
 ; CHECK-NEXT:    lsrl r4, r5, #31
+; CHECK-NEXT:    vmov r7, s15
 ; CHECK-NEXT:    subs.w r6, r4, #-1
-; CHECK-NEXT:    vmov.32 q2[0], r4
 ; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    vmov r6, s14
 ; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    lsrl r6, r7, #31
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q0[0], r5
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    lsrl r6, r5, #31
-; CHECK-NEXT:    subs.w r7, r6, #-1
-; CHECK-NEXT:    vmov.32 q2[2], r6
-; CHECK-NEXT:    sbcs r5, r5, #0
+; CHECK-NEXT:    csetm r11, ne
+; CHECK-NEXT:    subs.w r5, r6, #-1
+; CHECK-NEXT:    sbcs r5, r7, #0
+; CHECK-NEXT:    vmov.32 q0[1], r11
 ; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r6
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q0[2], r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r11, r5
 ; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:    vorn q0, q2, q0
 ; CHECK-NEXT:    vmov.f32 s1, s2
@@ -1021,7 +992,8 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    le lr, .LBB4_7
 ; CHECK-NEXT:  .LBB4_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
@@ -1591,12 +1563,12 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: ssatmul_8t_q15:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #24
-; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB9_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
@@ -1607,107 +1579,99 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    sub.w r12, r12, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI9_1
-; CHECK-NEXT:    vmov.i8 q2, #0x0
+; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    add.w lr, lr, r12, lsr #3
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q4, [r4]
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
+; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vdup.32 q0, r3
-; CHECK-NEXT:    adds r3, #8
-; CHECK-NEXT:    vorr q5, q0, q5
-; CHECK-NEXT:    vorr q0, q0, q4
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vdup.32 q6, r5
+; CHECK-NEXT:    adds r5, #8
+; CHECK-NEXT:    vorr q5, q6, q0
+; CHECK-NEXT:    vorr q6, q6, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q5
+; CHECK-NEXT:    vpsel q7, q3, q2
+; CHECK-NEXT:    vcmp.u32 cs, q1, q6
+; CHECK-NEXT:    vmov r4, s28
 ; CHECK-NEXT:    vpsel q6, q3, q2
-; CHECK-NEXT:    vcmp.u32 cs, q1, q0
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vpsel q0, q3, q2
 ; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov r4, s25
+; CHECK-NEXT:    vmov r4, s29
 ; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov r4, s30
 ; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s27
+; CHECK-NEXT:    vmov r4, s31
 ; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r4, s24
 ; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov r4, s25
 ; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r4, s26
 ; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov r4, s27
 ; CHECK-NEXT:    vmov.16 q5[7], r4
 ; CHECK-NEXT:    vpt.i16 ne, q5, zr
 ; CHECK-NEXT:    vldrht.u16 q6, [r0], #16
-; CHECK-NEXT:    vmov.u16 r4, q6[0]
+; CHECK-NEXT:    vmov.u16 r4, q6[2]
+; CHECK-NEXT:    vmov.u16 r3, q6[0]
+; CHECK-NEXT:    vmov q5[2], q5[0], r3, r4
+; CHECK-NEXT:    vmov.u16 r3, q6[3]
+; CHECK-NEXT:    vmov.u16 r4, q6[1]
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrht.u16 q7, [r1], #16
-; CHECK-NEXT:    vmov.32 q5[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q6[1]
-; CHECK-NEXT:    vmov.32 q5[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q6[2]
-; CHECK-NEXT:    vmov.32 q5[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q6[3]
-; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov q5[3], q5[1], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q7[2]
 ; CHECK-NEXT:    vmov.u16 r4, q7[0]
-; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q7[3]
 ; CHECK-NEXT:    vmov.u16 r4, q7[1]
-; CHECK-NEXT:    vmov.32 q0[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q7[2]
-; CHECK-NEXT:    vmov.32 q0[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q7[3]
-; CHECK-NEXT:    vmov.32 q0[3], r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vmov.u16 r4, q6[4]
 ; CHECK-NEXT:    vmullb.s16 q0, q0, q5
 ; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov.u16 r4, q6[4]
-; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.16 q5[0], r3
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov.16 q5[1], r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov.16 q5[2], r3
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov.16 q5[3], r3
+; CHECK-NEXT:    vmov.u16 r3, q6[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q6[7]
 ; CHECK-NEXT:    vmov.u16 r4, q6[5]
-; CHECK-NEXT:    vmov.32 q0[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q6[6]
-; CHECK-NEXT:    vmov.32 q0[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q6[7]
-; CHECK-NEXT:    vmov.32 q0[3], r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q7[6]
 ; CHECK-NEXT:    vmov.u16 r4, q7[4]
-; CHECK-NEXT:    vmov.32 q6[0], r4
+; CHECK-NEXT:    vmov q6[2], q6[0], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q7[7]
 ; CHECK-NEXT:    vmov.u16 r4, q7[5]
-; CHECK-NEXT:    vmov.32 q6[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q7[6]
-; CHECK-NEXT:    vmov.32 q6[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q7[7]
-; CHECK-NEXT:    vmov.32 q6[3], r4
+; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
 ; CHECK-NEXT:    vmullb.s16 q0, q6, q0
 ; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov.16 q5[7], r4
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.16 q5[4], r3
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov.16 q5[5], r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov.16 q5[6], r3
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov.16 q5[7], r3
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrht.16 q5, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB9_2
 ; CHECK-NEXT:  .LBB9_3: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI9_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index 7313cb66c9c9..433656058146 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -34,67 +34,64 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: sadd_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov lr, s4
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov lr, s6
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r1, gt
 ; CHECK-NEXT:    cmp.w r2, #-1
 ; CHECK-NEXT:    cset r3, gt
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r12, eq
 ; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adcs r2, r0
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r0, gt
-; CHECK-NEXT:    cmp r3, r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    and.w r0, r0, r12
-; CHECK-NEXT:    mvn r12, #-2147483648
-; CHECK-NEXT:    and r3, r0, #1
-; CHECK-NEXT:    cset r0, mi
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    cinv r0, r12, eq
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r1, r2, #31
-; CHECK-NEXT:    csel r0, r0, r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    cmp.w r0, #-1
-; CHECK-NEXT:    cset r1, gt
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r3, gt
-; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    cset lr, eq
-; CHECK-NEXT:    adds r1, r1, r4
 ; CHECK-NEXT:    adcs r0, r2
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r2, gt
 ; CHECK-NEXT:    cmp r3, r2
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    cset r2, ne
-; CHECK-NEXT:    and.w r2, r2, lr
-; CHECK-NEXT:    ands r2, r2, #1
+; CHECK-NEXT:    and.w r2, r2, r12
+; CHECK-NEXT:    ands r12, r2, #1
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    asrne r1, r0, #31
+; CHECK-NEXT:    cmp.w r3, #-1
+; CHECK-NEXT:    cset lr, gt
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r4, gt
+; CHECK-NEXT:    cmp r4, lr
+; CHECK-NEXT:    cset lr, eq
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    mvn r6, #-2147483648
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r3, gt
+; CHECK-NEXT:    cmp r4, r3
+; CHECK-NEXT:    cset r3, ne
+; CHECK-NEXT:    and.w r3, r3, lr
+; CHECK-NEXT:    ands r3, r3, #1
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    asrne r5, r2, #31
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r1
 ; CHECK-NEXT:    cset r1, mi
 ; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    cinv r1, r12, eq
-; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cinv r1, r6, eq
+; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    csel r0, r1, r0, ne
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r1, mi
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    cinv r1, r6, eq
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csel r1, r1, r2, ne
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %0 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
@@ -133,34 +130,36 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: uadd_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    adcs r1, r12, #0
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov r5, s0
 ; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    adcs r1, r12, #0
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    adcs lr, r12, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    adds r4, r4, r5
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adcs r3, r12, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r4, #-1
+; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r1, #-1
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
@@ -200,67 +199,64 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: ssub_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov lr, s4
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov lr, s6
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r1, gt
 ; CHECK-NEXT:    cmp.w r2, #-1
 ; CHECK-NEXT:    cset r3, gt
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r12, ne
 ; CHECK-NEXT:    subs.w r1, r1, lr
-; CHECK-NEXT:    sbcs r2, r0
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r0, gt
-; CHECK-NEXT:    cmp r3, r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    and.w r0, r0, r12
-; CHECK-NEXT:    mvn r12, #-2147483648
-; CHECK-NEXT:    and r3, r0, #1
-; CHECK-NEXT:    cset r0, mi
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    cinv r0, r12, eq
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r1, r2, #31
-; CHECK-NEXT:    csel r0, r0, r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    cmp.w r0, #-1
-; CHECK-NEXT:    cset r1, gt
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r3, gt
-; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    cset lr, ne
-; CHECK-NEXT:    subs r1, r4, r1
 ; CHECK-NEXT:    sbc.w r0, r2, r0
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r2, gt
 ; CHECK-NEXT:    cmp r3, r2
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    cset r2, ne
-; CHECK-NEXT:    and.w r2, r2, lr
-; CHECK-NEXT:    ands r2, r2, #1
+; CHECK-NEXT:    and.w r2, r2, r12
+; CHECK-NEXT:    ands r12, r2, #1
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    asrne r1, r0, #31
+; CHECK-NEXT:    cmp.w r3, #-1
+; CHECK-NEXT:    cset lr, gt
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r4, gt
+; CHECK-NEXT:    cmp r4, lr
+; CHECK-NEXT:    cset lr, ne
+; CHECK-NEXT:    subs r5, r6, r5
+; CHECK-NEXT:    sbcs r2, r3
+; CHECK-NEXT:    mvn r6, #-2147483648
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r3, gt
+; CHECK-NEXT:    cmp r4, r3
+; CHECK-NEXT:    cset r3, ne
+; CHECK-NEXT:    and.w r3, r3, lr
+; CHECK-NEXT:    ands r3, r3, #1
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    asrne r5, r2, #31
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r1
 ; CHECK-NEXT:    cset r1, mi
 ; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    cinv r1, r12, eq
-; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cinv r1, r6, eq
+; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    csel r0, r1, r0, ne
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r1, mi
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    cinv r1, r6, eq
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csel r1, r1, r2, ne
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %0 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
@@ -299,36 +295,38 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: usub_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    subs r2, r3, r2
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    sbcs.w r0, r1, r0
-; CHECK-NEXT:    adc r1, r12, #0
-; CHECK-NEXT:    rsbs.w r1, r1, #1
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    movne r0, #0
-; CHECK-NEXT:    movne r2, #0
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov r5, s0
 ; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    sbcs.w r0, r1, r0
 ; CHECK-NEXT:    adc r1, r12, #0
-; CHECK-NEXT:    rsbs.w r1, r1, #1
+; CHECK-NEXT:    rsbs.w lr, r1, #1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r2, #0
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    subs r4, r5, r4
+; CHECK-NEXT:    sbcs r1, r3
+; CHECK-NEXT:    adc r3, r12, #0
+; CHECK-NEXT:    rsbs.w r3, r3, #1
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne r4, #0
+; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r0, #0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne r1, #0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
index 4514de457859..4840e4acf01b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -64,9 +64,8 @@ define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x
 ; CHECK-NEXT:    ldrb r2, [r1]
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
 ; CHECK-NEXT:    ldrb r1, [r1, #1]
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.32 q2[2], r1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    strb r2, [r0, r1]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll
index 26f524d7aed8..4bec492b5ac9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll
@@ -62,15 +62,12 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i32(<2 x i64> %m) {
 ; CHECK-LABEL: sext_v2i64_v2i64_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %shl = shl <2 x i64> %m, <i64 32, i64 32>
@@ -81,16 +78,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i35(<2 x i64> %m) {
 ; CHECK-LABEL: sext_v2i64_v2i64_v2i35:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    sbfx r0, r0, #0, #3
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    sbfx r0, r0, #0, #3
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    sbfx r1, r1, #0, #3
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -177,24 +172,20 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext_v8i16_v8i32(<8 x i16> %src) {
 ; CHECK-LABEL: sext_v8i16_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q2, q1
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmovlb.s16 q2, q1
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
 ; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -207,46 +198,38 @@ define arm_aapcs_vfpcc <16 x i32> @sext_v16i8_v16i32(<16 x i8> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
 ; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
 ; CHECK-NEXT:    vmovlb.s16 q4, q1
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmovlb.s8 q0, q3
 ; CHECK-NEXT:    vmovlb.s16 q3, q0
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q2, q2
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
@@ -258,15 +241,12 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext_v2i32_v2i64(<2 x i32> %src) {
 ; CHECK-LABEL: sext_v2i32_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sext <2 x i32> %src to <2 x i64>
@@ -352,24 +332,20 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @zext_v8i16_v8i32(<8 x i16> %src) {
 ; CHECK-LABEL: zext_v8i16_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmovlb.u16 q2, q1
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmovlb.u16 q2, q1
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
 ; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -382,44 +358,36 @@ define arm_aapcs_vfpcc <16 x i32> @zext_v16i8_v16i32(<16 x i8> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vand q4, q1, q3
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.i32 q3, #0xff
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vand q4, q1, q3
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vand q3, q5, q3
+; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
 ; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
+; CHECK-NEXT:    vand q3, q5, q3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-shifts.ll
index 8243e0df2059..6b095a39625f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shifts.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shifts.ll
@@ -34,19 +34,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shl_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: shl_qq_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    lsll r2, r1, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q2[1], r1
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    lsll r2, r1, r0
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov r12, s4
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    lsll r0, r3, r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = shl <2 x i64> %src1, %src2
@@ -90,22 +87,21 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shru_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: shru_qq_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    lsll r0, r1, r2
+; CHECK-NEXT:    .save {r5, lr}
+; CHECK-NEXT:    push {r5, lr}
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vmov r5, s3
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    lsll r0, r1, r2
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    lsll r0, r5, r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    lsll r2, r3, r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
+; CHECK-NEXT:    pop {r5, pc}
 entry:
   %0 = lshr <2 x i64> %src1, %src2
   ret <2 x i64> %0
@@ -148,19 +144,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shrs_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: shrs_qq_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    asrl r2, r1, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q2[1], r1
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    asrl r2, r1, r0
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov r12, s4
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    asrl r0, r3, r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = ashr <2 x i64> %src1, %src2
@@ -201,17 +194,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shl_qi_int64_t(<2 x i64> %src1) {
 ; CHECK-LABEL: shl_qi_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    lsll r0, r1, #4
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    lsll r0, r1, #4
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    lsll r2, r3, #4
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = shl <2 x i64> %src1, <i64 4, i64 4>
@@ -252,17 +242,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shru_qi_int64_t(<2 x i64> %src1) {
 ; CHECK-LABEL: shru_qi_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    lsrl r0, r1, #4
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    lsrl r0, r1, #4
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    lsrl r2, r3, #4
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = lshr <2 x i64> %src1, <i64 4, i64 4>
@@ -303,17 +290,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shrs_qi_int64_t(<2 x i64> %src1) {
 ; CHECK-LABEL: shrs_qi_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    asrl r0, r1, #4
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    asrl r0, r1, #4
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    asrl r2, r3, #4
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = ashr <2 x i64> %src1, <i64 4, i64 4>
@@ -360,17 +344,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shl_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shl_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    lsll r2, r1, r0
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    lsll r2, r1, r0
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    lsll r12, r1, r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    lsll r2, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -422,18 +403,15 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shru_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shru_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    lsll r2, r1, r0
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    rsb.w r12, r0, #0
 ; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    lsll r2, r1, r0
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    lsll r2, r1, r12
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    lsll r0, r3, r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -485,17 +463,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shrs_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shrs_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    asrl r2, r1, r0
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    asrl r2, r1, r0
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    asrl r12, r1, r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    asrl r2, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index dc04c5e75837..d6e25d2a6864 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -49,10 +49,8 @@ define arm_aapcs_vfpcc <2 x i64> @add_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = add nsw <2 x i64> %src1, %src2
@@ -200,10 +198,8 @@ define arm_aapcs_vfpcc <2 x i64> @sub_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    subs r0, r2, r0
 ; CHECK-NEXT:    sbc.w r1, r3, r1
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = sub nsw <2 x i64> %src2, %src1
@@ -339,24 +335,22 @@ define arm_aapcs_vfpcc <2 x i64> @mul_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    umull r12, r3, r1, r0
-; CHECK-NEXT:    mla lr, r1, r2, r3
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    umull r12, r3, r1, r0
+; CHECK-NEXT:    mla lr, r1, r2, r3
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    umull r4, r5, r1, r3
 ; CHECK-NEXT:    mla r1, r1, r2, r5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    mla r0, r2, r0, lr
 ; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov.32 q0[0], r12
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.32 q0[2], r4
+; CHECK-NEXT:    mla r0, r2, r0, lr
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r12
 ; CHECK-NEXT:    mla r1, r2, r3, r1
-; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = mul nsw <2 x i64> %src1, %src2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
index 37ca5a2f2020..427d7f46ea4b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
@@ -70,11 +70,9 @@ define <2 x i64> @vector_add_i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; CHECK-FP-NEXT:    vmov r2, s1
 ; CHECK-FP-NEXT:    vmov r0, s5
 ; CHECK-FP-NEXT:    adds r1, r1, r3
-; CHECK-FP-NEXT:    vmov.32 q0[0], r1
+; CHECK-FP-NEXT:    vmov q0[2], q0[0], r1, lr
 ; CHECK-FP-NEXT:    adcs r0, r2
-; CHECK-FP-NEXT:    vmov.32 q0[1], r0
-; CHECK-FP-NEXT:    vmov.32 q0[2], lr
-; CHECK-FP-NEXT:    vmov.32 q0[3], r12
+; CHECK-FP-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-FP-NEXT:    vmov r0, r1, d0
 ; CHECK-FP-NEXT:    vmov r2, r3, d1
 ; CHECK-FP-NEXT:    pop {r7, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index cb82f9020d34..c69b7cce8c73 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -123,23 +123,20 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
 define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vabd_s16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
 ; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
 ; CHECK-NEXT:    vsub.i32 q2, q3, q2
 ; CHECK-NEXT:    vabs.s32 q3, q2
@@ -151,23 +148,18 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-NEXT:    vmov.16 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q1, q3
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmovlb.s16 q1, q3
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmovlb.s16 q0, q3
 ; CHECK-NEXT:    vsub.i32 q0, q0, q1
 ; CHECK-NEXT:    vabs.s32 q0, q0
@@ -194,47 +186,46 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vabd_s32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.f32 s8, s0
-; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov.f32 s14, s5
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s2
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmov.f32 s16, s6
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    subs r0, r0, r2
 ; CHECK-NEXT:    sbc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
 ; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    asrs r2, r1, #31
 ; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    sbc.w r2, r2, r3, asr #31
+; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    add.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    eor.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    subs r0, r0, r2
-; CHECK-NEXT:    sbc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    add.w r0, r0, r1, asr #31
-; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov r0, s14
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    subs r0, r0, r2
 ; CHECK-NEXT:    sbc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
 ; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    sbc.w r2, r2, r3, asr #31
+; CHECK-NEXT:    add.w r1, r1, r2, asr #31
+; CHECK-NEXT:    eor.w r1, r1, r2, asr #31
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
   %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -368,23 +359,20 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
 define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vabd_u16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
 ; CHECK-NEXT:    vmovlb.u16 q2, q2
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmovlb.u16 q3, q3
 ; CHECK-NEXT:    vsub.i32 q2, q3, q2
 ; CHECK-NEXT:    vabs.s32 q3, q2
@@ -396,23 +384,18 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-NEXT:    vmov.16 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmovlb.u16 q1, q3
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmovlb.u16 q1, q3
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmovlb.u16 q0, q3
 ; CHECK-NEXT:    vsub.i32 q0, q0, q1
 ; CHECK-NEXT:    vabs.s32 q0, q0
@@ -441,56 +424,51 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s16, s0
-; CHECK-NEXT:    vmov.f32 s10, s5
-; CHECK-NEXT:    vmov.f32 s18, s1
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vand q4, q4, q3
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov r1, s19
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vand q2, q2, q4
+; CHECK-NEXT:    vand q3, q3, q4
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov r1, s13
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vand q1, q1, q4
+; CHECK-NEXT:    vand q4, q0, q4
 ; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    sbc.w r0, r1, r0
 ; CHECK-NEXT:    add.w r1, r2, r0, asr #31
 ; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    eor.w r12, r1, r0, asr #31
 ; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov.f32 s16, s6
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vand q1, q4, q3
-; CHECK-NEXT:    vmov.f32 s16, s2
-; CHECK-NEXT:    vmov.f32 s18, s3
-; CHECK-NEXT:    vand q0, q4, q3
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    subs r0, r0, r3
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    sbc.w r1, r2, r1
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
 ; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.32 q2[1], r12
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    sbc.w r0, r1, r0
-; CHECK-NEXT:    add.w r1, r2, r0, asr #31
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s15
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r12
+; CHECK-NEXT:    vmov r0, s11
 ; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    sbc.w r0, r1, r0
 ; CHECK-NEXT:    add.w r1, r2, r0, asr #31
-; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov r2, s19
+; CHECK-NEXT:    eor.w r12, r1, r0, asr #31
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    subs r0, r0, r3
+; CHECK-NEXT:    sbc.w r1, r2, r1
+; CHECK-NEXT:    add.w r0, r0, r1, asr #31
+; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
@@ -618,8 +596,8 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
 ; CHECK-LABEL: vabd_loop_s32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9}
@@ -650,37 +628,34 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    vmov.f32 s14, s7
 ; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vmov.f32 s6, s11
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    subs.w r9, r5, r7
 ; CHECK-NEXT:    asr.w r6, r5, #31
-; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov r5, s12
 ; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
-; CHECK-NEXT:    vmov.32 q1[0], r8
-; CHECK-NEXT:    vmov.32 q1[1], r9
 ; CHECK-NEXT:    and.w r6, r12, r6, asr #31
 ; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    bfi r4, r6, #4, #4
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    asrs r7, r6, #31
-; CHECK-NEXT:    subs r6, r6, r3
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    subs.w r10, r6, r3
+; CHECK-NEXT:    asr.w r7, r6, #31
 ; CHECK-NEXT:    sbc.w r3, r7, r3, asr #31
-; CHECK-NEXT:    vmov.32 q1[2], r6
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    asrs r7, r3, #31
-; CHECK-NEXT:    subs r3, r3, r5
-; CHECK-NEXT:    sbc.w r5, r7, r5, asr #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    asrs r7, r5, #31
-; CHECK-NEXT:    and.w r5, r12, r5, asr #31
-; CHECK-NEXT:    vmov.32 q2[2], r7
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov r7, s8
-; CHECK-NEXT:    and r7, r7, #1
-; CHECK-NEXT:    rsbs r7, r7, #0
-; CHECK-NEXT:    bfi r4, r7, #8, #4
-; CHECK-NEXT:    bfi r4, r5, #12, #4
+; CHECK-NEXT:    vmov r7, s4
+; CHECK-NEXT:    asrs r6, r5, #31
+; CHECK-NEXT:    asr.w r11, r3, #31
+; CHECK-NEXT:    and.w r3, r12, r3, asr #31
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    subs r5, r5, r7
+; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
+; CHECK-NEXT:    asrs r6, r6, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r6, r11
+; CHECK-NEXT:    vmov r6, s4
+; CHECK-NEXT:    vmov q1[2], q1[0], r8, r5
+; CHECK-NEXT:    vmov q1[3], q1[1], r9, r10
+; CHECK-NEXT:    and r6, r6, #1
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    bfi r4, r6, #8, #4
+; CHECK-NEXT:    bfi r4, r3, #12, #4
 ; CHECK-NEXT:    vmsr p0, r4
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vsubt.i32 q1, q0, q1
@@ -689,7 +664,7 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   br label %vector.body
 
@@ -834,14 +809,15 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
 ; CHECK-LABEL: vabd_loop_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    mov.w lr, #256
 ; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    mov.w r12, #1
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:  .LBB11_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -872,37 +848,36 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    vmov.f32 s16, s14
 ; CHECK-NEXT:    vmov.f32 s18, s15
 ; CHECK-NEXT:    vand q3, q4, q0
+; CHECK-NEXT:    vmov r12, s12
 ; CHECK-NEXT:    subs.w r9, r3, r5
-; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r5, s14
 ; CHECK-NEXT:    sbc.w r3, r7, r6
-; CHECK-NEXT:    vmov r7, s8
-; CHECK-NEXT:    vmov r6, s13
-; CHECK-NEXT:    and.w r3, r12, r3, asr #31
+; CHECK-NEXT:    movs r7, #1
+; CHECK-NEXT:    vmov r6, s15
+; CHECK-NEXT:    and.w r3, r7, r3, asr #31
+; CHECK-NEXT:    vmov r7, s10
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    bfi r4, r3, #4, #4
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    subs.w r10, r5, r7
-; CHECK-NEXT:    vmov r7, s10
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    sbc.w r3, r6, r3
-; CHECK-NEXT:    vmov r6, s15
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q4[0], r3
 ; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    vmov.32 q2[0], r8
-; CHECK-NEXT:    vmov.32 q2[1], r9
-; CHECK-NEXT:    vmov.32 q2[2], r10
-; CHECK-NEXT:    subs r5, r5, r7
-; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    subs.w r10, r5, r7
+; CHECK-NEXT:    vmov r7, s9
+; CHECK-NEXT:    vmov r5, s13
 ; CHECK-NEXT:    sbc.w r3, r6, r3
-; CHECK-NEXT:    asrs r6, r3, #31
-; CHECK-NEXT:    and.w r3, r12, r3, asr #31
-; CHECK-NEXT:    vmov.32 q4[2], r6
+; CHECK-NEXT:    vmov r6, s8
+; CHECK-NEXT:    asr.w r11, r3, #31
+; CHECK-NEXT:    subs.w r6, r12, r6
+; CHECK-NEXT:    sbc.w r7, r5, r7
+; CHECK-NEXT:    asrs r7, r7, #31
+; CHECK-NEXT:    vmov q2[2], q2[0], r7, r11
+; CHECK-NEXT:    vmov r7, s8
+; CHECK-NEXT:    vmov q2[2], q2[0], r8, r6
+; CHECK-NEXT:    vmov q2[3], q2[1], r9, r10
+; CHECK-NEXT:    and r7, r7, #1
+; CHECK-NEXT:    rsbs r7, r7, #0
+; CHECK-NEXT:    bfi r4, r7, #8, #4
+; CHECK-NEXT:    movs r7, #1
+; CHECK-NEXT:    and.w r3, r7, r3, asr #31
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov r6, s16
-; CHECK-NEXT:    and r6, r6, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    bfi r4, r6, #8, #4
 ; CHECK-NEXT:    bfi r4, r3, #12, #4
 ; CHECK-NEXT:    vmsr p0, r4
 ; CHECK-NEXT:    vpst
@@ -911,7 +886,8 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    le lr, .LBB11_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   br label %vector.body
 

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
index a40beb4b0eba..aeffc39b26f8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
@@ -367,36 +367,31 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, <2 x i64> %srcb, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r0
 ; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vbic q0, q3, q4
-; CHECK-NEXT:    vand q1, q2, q4
-; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q1, q3, q0
+; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vorr q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, %srcb
@@ -407,36 +402,31 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, <2 x i64> %srcb, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r0
 ; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vbic q0, q3, q4
-; CHECK-NEXT:    vand q1, q2, q4
-; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    eors r2, r3
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q1, q3, q0
+; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vorr q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, %srcb
@@ -447,84 +437,76 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
 ; CHECK-LABEL: vcmp_multi_v2i32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vmov lr, s0
-; CHECK-NEXT:    subs.w r1, lr, r2
-; CHECK-NEXT:    asr.w r12, lr, #31
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q0, q2, q0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    subs r1, r0, r2
+; CHECK-NEXT:    asr.w r12, r0, #31
 ; CHECK-NEXT:    sbcs.w r1, r12, r2, asr #31
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    subs r0, r1, r2
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    sbcs.w r0, r12, r2, asr #31
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    csetm lr, ne
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    subs r4, r2, r1
+; CHECK-NEXT:    sbcs.w r1, r12, r1, asr #31
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, lr
 ; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, lr
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
+; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vand q1, q5, q4
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vand q1, q3, q1
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, pc}
   %a4 = icmp eq <2 x i64> %a, zeroinitializer
   %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c
   %a6 = icmp ne <2 x i32> %b, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
index 06361d952120..a8f04d389e02 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
@@ -433,29 +433,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    eors r0, r3
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -468,29 +466,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    eors r0, r3
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -503,84 +499,76 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
 ; CHECK-LABEL: vcmp_multi_v2i32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vmov lr, s0
-; CHECK-NEXT:    subs.w r1, lr, r2
-; CHECK-NEXT:    asr.w r12, lr, #31
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q0, q2, q0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    subs r1, r0, r2
+; CHECK-NEXT:    asr.w r12, r0, #31
 ; CHECK-NEXT:    sbcs.w r1, r12, r2, asr #31
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    subs r0, r1, r2
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    sbcs.w r0, r12, r2, asr #31
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    csetm lr, ne
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    subs r4, r2, r1
+; CHECK-NEXT:    sbcs.w r1, r12, r1, asr #31
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, lr
 ; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, lr
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
+; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vand q1, q5, q4
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vand q1, q3, q1
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, pc}
   %a4 = icmp eq <2 x i64> %a, zeroinitializer
   %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c
   %a6 = icmp ne <2 x i32> %b, zeroinitializer
@@ -1026,29 +1014,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_r_eq_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    eors r0, r3
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -1061,29 +1047,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_r_eq_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    eors r0, r2
+; CHECK-NEXT:    eors r1, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    eors r0, r3
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -1096,84 +1080,76 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_r_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
 ; CHECK-LABEL: vcmp_r_multi_v2i32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vmov lr, s0
-; CHECK-NEXT:    subs.w r1, lr, r2
-; CHECK-NEXT:    asr.w r12, lr, #31
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q0, q2, q0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    subs r1, r0, r2
+; CHECK-NEXT:    asr.w r12, r0, #31
 ; CHECK-NEXT:    sbcs.w r1, r12, r2, asr #31
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    subs r0, r1, r2
-; CHECK-NEXT:    asr.w r12, r1, #31
-; CHECK-NEXT:    sbcs.w r0, r12, r2, asr #31
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    csetm lr, ne
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    subs r4, r2, r1
+; CHECK-NEXT:    sbcs.w r1, r12, r1, asr #31
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, lr
 ; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, lr
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
+; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vand q1, q5, q4
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, ne
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vand q1, q3, q1
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, pc}
   %a4 = icmp eq <2 x i64> %a, zeroinitializer
   %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c
   %a6 = icmp ne <2 x i32> %b, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
index e9b717494234..78df48775ebe 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
@@ -361,25 +361,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_eqz_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, zeroinitializer
@@ -390,25 +388,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_eqz_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, zeroinitializer
@@ -779,25 +775,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_r_eqz_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> zeroinitializer, %src
@@ -808,25 +802,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_r_eqz_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vbic q0, q2, q3
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vbic q2, q2, q0
+; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll
index e408bc46b47a..63e817f1e8eb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll
@@ -4,10 +4,8 @@
 define arm_aapcs_vfpcc <4 x i32> @vcreate_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: vcreate_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %conv = zext i32 %a to i64
@@ -27,10 +25,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_0123(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_0123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -43,10 +39,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_3210(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_3210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -59,10 +53,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_0213(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_0213:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -75,8 +67,7 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_0220(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_0220:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -89,9 +80,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_321(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_321:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[1], r2
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -104,8 +94,7 @@ define arm_aapcs_vfpcc <4 x i32> @insert_310(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_310:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -117,8 +106,7 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_320(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_320:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
 ; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -131,8 +119,7 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_31(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_31:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -165,9 +152,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_210(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 2
@@ -179,8 +165,7 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_20(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_20:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 2
@@ -245,28 +230,26 @@ entry:
 define hidden <8 x i16> @create_i16(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d, i16 zeroext %a2, i16 zeroext %b2, i16 zeroext %c2, i16 zeroext %d2) local_unnamed_addr #0 {
 ; CHECK-LABEL: create_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    orr.w r0, r1, r0, lsl #16
-; CHECK-NEXT:    lsll r2, r5, #16
-; CHECK-NEXT:    ldrd lr, r4, [sp, #16]
-; CHECK-NEXT:    orr.w r1, r2, r3
+; CHECK-NEXT:    .save {r5, r7, r9, lr}
+; CHECK-NEXT:    push.w {r5, r7, r9, lr}
 ; CHECK-NEXT:    ldr.w r12, [sp, #24]
-; CHECK-NEXT:    orrs r0, r5
-; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldr r0, [sp, #28]
+; CHECK-NEXT:    ldr r5, [sp, #28]
+; CHECK-NEXT:    lsll r2, r9, #16
 ; CHECK-NEXT:    lsll r12, r7, #16
-; CHECK-NEXT:    orr.w r4, r4, lr, lsl #16
-; CHECK-NEXT:    orr.w r0, r0, r12
-; CHECK-NEXT:    orrs r7, r4
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r7
+; CHECK-NEXT:    orr.w r5, r5, r12
+; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
+; CHECK-NEXT:    ldrd r2, r3, [sp, #16]
+; CHECK-NEXT:    orr.w r0, r1, r0, lsl #16
+; CHECK-NEXT:    orr.w r0, r0, r9
+; CHECK-NEXT:    orr.w r2, r3, r2, lsl #16
+; CHECK-NEXT:    orrs r2, r7
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
 ; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop.w {r5, r7, r9, pc}
 entry:
   %conv = zext i16 %a to i64
   %shl = shl nuw i64 %conv, 48
@@ -325,59 +308,59 @@ entry:
 define hidden <16 x i8> @create_i8(i8 zeroext %a1, i8 zeroext %b1, i8 zeroext %c1, i8 zeroext %d1, i8 zeroext %a2, i8 zeroext %b2, i8 zeroext %c2, i8 zeroext %d2, i8 zeroext %a3, i8 zeroext %b3, i8 zeroext %c3, i8 zeroext %d3, i8 zeroext %a4, i8 zeroext %b4, i8 zeroext %c4, i8 zeroext %d4) local_unnamed_addr #0 {
 ; CHECK-LABEL: create_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, r9, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r7, r9, r11, lr}
-; CHECK-NEXT:    ldr.w r12, [sp, #28]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    ldr r4, [sp, #68]
 ; CHECK-NEXT:    mov.w r11, #0
-; CHECK-NEXT:    ldr r4, [sp, #24]
+; CHECK-NEXT:    ldr r6, [sp, #64]
 ; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    lsll r12, r11, #16
-; CHECK-NEXT:    lsls r1, r1, #16
-; CHECK-NEXT:    lsll r4, r5, #24
-; CHECK-NEXT:    orr.w r0, r1, r0, lsl #22
-; CHECK-NEXT:    orr.w r12, r12, r4
-; CHECK-NEXT:    ldr r4, [sp, #32]
+; CHECK-NEXT:    lsll r4, r11, #16
+; CHECK-NEXT:    mov lr, r1
+; CHECK-NEXT:    lsll r6, r5, #24
 ; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    orr.w r0, r0, r2, lsl #8
+; CHECK-NEXT:    orr.w r1, r6, r4
+; CHECK-NEXT:    ldr r4, [sp, #72]
+; CHECK-NEXT:    mov r12, r3
+; CHECK-NEXT:    ldr r3, [sp, #76]
 ; CHECK-NEXT:    lsll r4, r7, #8
-; CHECK-NEXT:    add r0, r3
-; CHECK-NEXT:    orr.w r12, r12, r4
-; CHECK-NEXT:    ldr r4, [sp, #36]
-; CHECK-NEXT:    orrs r0, r5
-; CHECK-NEXT:    ldr r2, [sp, #56]
-; CHECK-NEXT:    orr.w r0, r0, r11
-; CHECK-NEXT:    orr.w r4, r4, r12
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    orrs r0, r7
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldr r0, [sp, #60]
-; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    ldr r6, [sp, #36]
+; CHECK-NEXT:    orrs r1, r4
+; CHECK-NEXT:    ldr r4, [sp, #32]
+; CHECK-NEXT:    orr.w r8, r1, r3
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    lsll r0, r1, #16
-; CHECK-NEXT:    lsll r2, r3, #24
-; CHECK-NEXT:    orrs r0, r2
-; CHECK-NEXT:    ldr r2, [sp, #64]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    lsll r6, r3, #16
+; CHECK-NEXT:    lsll r4, r1, #24
 ; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    lsll r2, r9, #8
-; CHECK-NEXT:    orrs r0, r2
-; CHECK-NEXT:    ldr r2, [sp, #68]
-; CHECK-NEXT:    orrs r0, r2
-; CHECK-NEXT:    ldr r2, [sp, #40]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    ldr r0, [sp, #44]
-; CHECK-NEXT:    lsls r0, r0, #16
-; CHECK-NEXT:    orr.w r0, r0, r2, lsl #22
-; CHECK-NEXT:    ldr r2, [sp, #48]
+; CHECK-NEXT:    orrs r4, r6
+; CHECK-NEXT:    ldr r6, [sp, #40]
+; CHECK-NEXT:    lsll r6, r9, #8
+; CHECK-NEXT:    orrs r4, r6
+; CHECK-NEXT:    ldr r6, [sp, #44]
+; CHECK-NEXT:    orrs r4, r6
+; CHECK-NEXT:    ldr r6, [sp, #48]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r8
+; CHECK-NEXT:    ldr r4, [sp, #52]
+; CHECK-NEXT:    lsls r4, r4, #16
+; CHECK-NEXT:    orr.w r4, r4, r6, lsl #22
+; CHECK-NEXT:    ldr r6, [sp, #56]
+; CHECK-NEXT:    orr.w r4, r4, r6, lsl #8
+; CHECK-NEXT:    ldr r6, [sp, #60]
+; CHECK-NEXT:    add r4, r6
+; CHECK-NEXT:    orrs r4, r5
+; CHECK-NEXT:    orr.w r4, r4, r11
+; CHECK-NEXT:    orrs r4, r7
+; CHECK-NEXT:    lsl.w r7, lr, #16
+; CHECK-NEXT:    orr.w r0, r7, r0, lsl #22
 ; CHECK-NEXT:    orr.w r0, r0, r2, lsl #8
-; CHECK-NEXT:    ldr r2, [sp, #52]
-; CHECK-NEXT:    add r0, r2
-; CHECK-NEXT:    orrs r0, r3
+; CHECK-NEXT:    add r0, r12
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    orrs r0, r3
 ; CHECK-NEXT:    orr.w r0, r0, r9
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r4
 ; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    pop.w {r4, r5, r7, r9, r11, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
 entry:
   %conv = zext i8 %a1 to i64
   %shl = shl nuw nsw i64 %conv, 54

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index 831ca0499333..2b165881badf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -43,18 +43,16 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @foo_int32_float(<4 x float> %src) {
 ; CHECK-MVE-LABEL: foo_int32_float:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s0
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s1
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s2
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s2
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s0
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s8, s3
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s1
 ; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vmov.32 q0[0], r0
-; CHECK-MVE-NEXT:    vmov r0, s6
-; CHECK-MVE-NEXT:    vmov.32 q0[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s10
-; CHECK-MVE-NEXT:    vmov.32 q0[2], r0
+; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.32 q0[3], r0
+; CHECK-MVE-NEXT:    vmov r1, s10
+; CHECK-MVE-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_int32_float:
@@ -69,18 +67,16 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @foo_uint32_float(<4 x float> %src) {
 ; CHECK-MVE-LABEL: foo_uint32_float:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s4, s0
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s6, s1
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s10, s2
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s4, s2
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s6, s0
 ; CHECK-MVE-NEXT:    vcvt.u32.f32 s8, s3
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s10, s1
 ; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vmov.32 q0[0], r0
-; CHECK-MVE-NEXT:    vmov r0, s6
-; CHECK-MVE-NEXT:    vmov.32 q0[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s10
-; CHECK-MVE-NEXT:    vmov.32 q0[2], r0
+; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.32 q0[3], r0
+; CHECK-MVE-NEXT:    vmov r1, s10
+; CHECK-MVE-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_uint32_float:
@@ -349,24 +345,21 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @foo_int64_float(<2 x double> %src) {
 ; CHECK-LABEL: foo_int64_float:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, r1, d8
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_d2lz
-; CHECK-NEXT:    vmov r2, r3, d9
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_d2lz
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = fptosi <2 x double> %src to <2 x i64>
   ret <2 x i64> %out
@@ -375,24 +368,21 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @foo_uint64_float(<2 x double> %src) {
 ; CHECK-LABEL: foo_uint64_float:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov r0, r1, d8
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_d2ulz
-; CHECK-NEXT:    vmov r2, r3, d9
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_d2ulz
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %out = fptoui <2 x double> %src to <2 x i64>
   ret <2 x i64> %out

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
index bce76f037a78..75147225afb0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -38,10 +38,8 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vdup_i64(i64 %src) {
 ; CHECK-LABEL: vdup_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <2 x i64> undef, i64 %src, i32 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
index df2cb4361f2f..995926a1502e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -129,41 +129,37 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
 ; CHECK-LABEL: add_v8i16_v8i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.i64 q1, #0xffff
-; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q3[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s15
 ; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -182,63 +178,57 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
 ; CHECK-LABEL: add_v8i16_v8i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
 ; CHECK-NEXT:    adc.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[3]
 ; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    asrs r3, r1, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r12, r3
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adc.w r1, r2, r1, asr #31
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r0, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    adds r2, r2, r1
+; CHECK-NEXT:    adc.w r1, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    sxth r3, r0
+; CHECK-NEXT:    adds r0, r2, r3
+; CHECK-NEXT:    adc.w r1, r1, r3, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -394,40 +384,36 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.i64 q1, #0xff
-; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov.u8 r2, q0[2]
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q3[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s15
 ; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -436,11 +422,10 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -449,11 +434,10 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -462,11 +446,10 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -475,12 +458,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[14]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -499,131 +481,117 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
 ; CHECK-LABEL: add_v16i8_v16i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    asrs r1, r0, #31
-; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
 ; CHECK-NEXT:    adc.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[3]
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    asrs r3, r1, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r12, r3
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adc.w r1, r2, r1, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
+; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
+; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    asrs r0, r1, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r0, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    adds r2, r2, r1
+; CHECK-NEXT:    adc.w r1, r0, r1, asr #31
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    sxtb r3, r0
+; CHECK-NEXT:    adds r0, r2, r3
+; CHECK-NEXT:    adc.w r1, r1, r3, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -828,41 +796,37 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmov.i64 q1, #0xffff
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[2]
-; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    add.w r12, r3, r2
 ; CHECK-NEXT:    vmov.u16 r3, q0[3]
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    add.w r12, r2, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov lr, s15
+; CHECK-NEXT:    add r12, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds.w r4, r12, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    adc.w r12, r2, lr
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r3, r3, r4
@@ -886,48 +850,42 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r12
 ; CHECK-NEXT:    vmov lr, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r12, s5
 ; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
 ; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
 ; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    adds.w r4, r4, lr
 ; CHECK-NEXT:    adc.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov.u16 r4, q0[4]
-; CHECK-NEXT:    sxth r4, r4
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov.32 q1[1], r4
 ; CHECK-NEXT:    vmov.u16 r4, q0[5]
+; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    sxth r4, r4
-; CHECK-NEXT:    vmov.32 q1[2], r4
-; CHECK-NEXT:    asrs r2, r4, #31
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds.w r3, r3, lr
@@ -1118,40 +1076,36 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    vmov.i64 q1, #0xff
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[1]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    add.w r12, r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[3]
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.u8 r2, q0[2]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    add.w r12, r2, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov lr, s15
+; CHECK-NEXT:    add r12, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds.w r4, r12, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
 ; CHECK-NEXT:    adc.w r12, r2, lr
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -1160,59 +1114,55 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    adc.w r3, r12, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    vmov.32 q2[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov.32 q2[2], r4
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    adds.w r12, lr, r2
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    adc.w r12, r3, r2
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adds.w r12, r12, r2
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w lr, r4, r2
 ; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    vmov.32 q2[2], r4
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    adc.w r12, r3, r2
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adds.w r12, r12, r2
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w lr, r4, r2
 ; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov.32 q2[2], r4
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    adc.w r12, r3, r2
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adds.w r12, r12, r2
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w lr, r4, r2
 ; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    vmov.32 q2[2], r4
-; CHECK-NEXT:    vand q0, q2, q1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    adc.w r12, r3, r2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r4, pc}
@@ -1228,116 +1178,102 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asr.w r12, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r12
 ; CHECK-NEXT:    vmov lr, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r12, s5
 ; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
 ; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q1[1], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[3]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
 ; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    adds.w r4, r4, lr
 ; CHECK-NEXT:    adc.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[4]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov.32 q1[1], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[5]
+; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[2], r4
-; CHECK-NEXT:    asrs r2, r4, #31
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds.w r3, r3, lr
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
 ; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.u8 r4, q0[6]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov.32 q1[1], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[7]
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[2], r4
-; CHECK-NEXT:    asrs r2, r4, #31
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds.w r3, r3, lr
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov.32 q1[1], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[9]
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[2], r4
-; CHECK-NEXT:    asrs r2, r4, #31
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds.w r3, r3, lr
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
 ; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov.32 q1[1], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[11]
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[2], r4
-; CHECK-NEXT:    asrs r2, r4, #31
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds.w r3, r3, lr
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
 ; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov.32 q1[1], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[13]
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov.32 q1[2], r4
-; CHECK-NEXT:    asrs r2, r4, #31
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds.w r3, r3, lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index e59fb0bb1ef4..0403bb4781be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -45,23 +45,21 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %b) {
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -80,29 +78,25 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %b) {
 ; CHECK-LABEL: add_v2i32_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -201,29 +195,24 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q2[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q2[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vmrs r0, p0
-; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    and r2, r0, #1
 ; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r1, s15
@@ -232,76 +221,65 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    ubfx r3, r0, #8, #1
+; CHECK-NEXT:    ubfx r3, r0, #12, #1
+; CHECK-NEXT:    ubfx r0, r0, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r0, r3
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r0, s13
 ; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    adds.w r12, r2, r0
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    vmov r1, s15
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q2[4]
+; CHECK-NEXT:    adc.w r12, r0, r1
+; CHECK-NEXT:    vmov.u16 r1, q2[6]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
+; CHECK-NEXT:    vmov.u16 r1, q2[7]
+; CHECK-NEXT:    vmov.u16 r3, q2[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    vmrs r1, p0
+; CHECK-NEXT:    and r0, r1, #1
+; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[2], r3
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
 ; CHECK-NEXT:    vmov.u16 r3, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    vmov.u16 r3, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    adc.w r12, r12, r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r12, r3
+; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    ubfx r1, r1, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vand q0, q3, q1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    adcs r1, r2
@@ -325,59 +303,49 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q1, q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r0, p0
-; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    and r2, r0, #1
 ; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
 ; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov.32 q3[3], r1
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r3, r12
-; CHECK-NEXT:    ubfx r3, r0, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    ubfx r3, r0, #12, #1
+; CHECK-NEXT:    ubfx r0, r0, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r0, r0, #0
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r0
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r0, s9
@@ -387,62 +355,52 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    adds.w r12, r1, r0
 ; CHECK-NEXT:    adc.w r1, r2, r3
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[4]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[5]
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    and r0, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q1[2], r3
-; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r3
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
 ; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.u16 r3, q0[5]
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r0
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q1[1], r3
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -466,21 +424,19 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %b) {
 ; CHECK-NEXT:    vmov.i64 q2, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -500,32 +456,28 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %b) {
 ; CHECK-LABEL: add_v2i16_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q2, #0xffff
-; CHECK-NEXT:    vand q2, q1, q2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -709,29 +661,24 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov.16 q1[7], r0
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q5[0]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[1]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[2]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q5[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[3]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q5[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vmrs r0, p0
-; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    and r2, r0, #1
 ; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q6[2], r1
-; CHECK-NEXT:    vmov.32 q6[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.32 q7[0], r1
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q7[2], r1
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r1, s27
@@ -740,76 +687,65 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    vmov r2, s26
 ; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    ubfx r3, r0, #8, #1
+; CHECK-NEXT:    ubfx r3, r0, #12, #1
+; CHECK-NEXT:    ubfx r0, r0, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    vmov.32 q6[0], r3
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.32 q6[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q7[0], r0
+; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
+; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q7[2], r0
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
+; CHECK-NEXT:    vmov q7[2], q7[0], r3, r0
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    vmov r0, s25
 ; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    adds.w r12, r2, r0
-; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[5]
-; CHECK-NEXT:    vmov.32 q6[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[6]
-; CHECK-NEXT:    vmov.32 q6[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[7]
-; CHECK-NEXT:    vmov.32 q6[3], r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s26
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    vmov r1, s27
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q5[4]
+; CHECK-NEXT:    adc.w r12, r0, r1
+; CHECK-NEXT:    vmov.u16 r1, q5[6]
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r1
+; CHECK-NEXT:    vmov.u16 r1, q5[7]
+; CHECK-NEXT:    vmov.u16 r3, q5[5]
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r1
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q5[0], r3
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    vmrs r1, p0
+; CHECK-NEXT:    and r0, r1, #1
+; CHECK-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q5[2], r3
-; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.u8 r3, q0[4]
-; CHECK-NEXT:    vmov.32 q6[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    vmov.32 q6[2], r3
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r0
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    adds.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    adc.w r12, r12, r0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r12, r3
+; CHECK-NEXT:    ubfx r3, r1, #12, #1
+; CHECK-NEXT:    ubfx r1, r1, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q5[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov q5[2], q5[0], r1, r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r1, r3
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r1
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    vmov r1, s21
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    vmov r3, s22
 ; CHECK-NEXT:    adcs r1, r2
@@ -834,50 +770,42 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov.16 q5[7], r2
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.32 q3[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    and r0, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[2], r3
-; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
 ; CHECK-NEXT:    vmov.u8 r3, q0[8]
-; CHECK-NEXT:    vmov.32 q4[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
 ; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov r1, s15
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r3, s12
@@ -887,52 +815,44 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s15
 ; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    vmov.u16 r3, q2[4]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.32 q3[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    and r0, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[2], r3
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
 ; CHECK-NEXT:    vmov.u8 r3, q0[12]
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
 ; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vand q0, q3, q1
+; CHECK-NEXT:    vmov.u8 r3, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -979,59 +899,49 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov.16 q4[7], r0
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q4, q2, q1
-; CHECK-NEXT:    vmov.u16 r0, q4[0]
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[1]
-; CHECK-NEXT:    vmov.32 q5[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q4[0]
+; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q4[1]
+; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
 ; CHECK-NEXT:    vmrs r0, p0
-; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.32 q5[1], r1
+; CHECK-NEXT:    and r2, r0, #1
 ; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    vmov.32 q6[2], r1
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov.32 q6[3], r1
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r1, s22
-; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    vmov r1, s20
 ; CHECK-NEXT:    vmov r12, s23
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r3, r12
-; CHECK-NEXT:    ubfx r3, r0, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    vmov.32 q5[0], r3
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q6[1], r0
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    ubfx r3, r0, #12, #1
+; CHECK-NEXT:    ubfx r0, r0, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r0, r0, #0
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q6[3], r0
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r0
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r0, s21
@@ -1041,61 +951,51 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov r0, s22
 ; CHECK-NEXT:    adds.w r12, r1, r0
 ; CHECK-NEXT:    adc.w r1, r2, r3
-; CHECK-NEXT:    vmov.u16 r2, q4[4]
-; CHECK-NEXT:    vmov.32 q5[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q4[5]
-; CHECK-NEXT:    vmov.32 q5[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[6]
-; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q4[4]
+; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[7]
-; CHECK-NEXT:    vmov.32 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q4[5]
+; CHECK-NEXT:    vmov q5[3], q5[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q4[0], r3
-; CHECK-NEXT:    vmov.32 q4[1], r3
+; CHECK-NEXT:    and r0, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q4[2], r3
-; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
+; CHECK-NEXT:    vmov q4[3], q4[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q5[0], r3
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q5[2], r3
+; CHECK-NEXT:    vmov q5[2], q5[0], r3, r0
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r3, r0
 ; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q4[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q4[1], r3
-; CHECK-NEXT:    vmov.32 q4[2], r2
-; CHECK-NEXT:    vmov.32 q4[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q5[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q5[1], r2
+; CHECK-NEXT:    vmov r1, s19
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q5[3], r2
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q5[3], q5[1], r3, r2
 ; CHECK-NEXT:    vand q4, q5, q4
 ; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    vmov r2, s17
@@ -1123,61 +1023,51 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov.16 q4[7], r2
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    and r0, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[2], r3
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
 ; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r0
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
 ; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -1186,63 +1076,53 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[4]
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[5]
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    and r0, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q1[2], r3
-; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
 ; CHECK-NEXT:    vmov.u8 r3, q0[12]
+; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r0
+; CHECK-NEXT:    asrs r0, r0, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r12, r12, r0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q1[1], r3
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r3, q0[14]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -1267,21 +1147,19 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %b) {
 ; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -1301,32 +1179,28 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %b) {
 ; CHECK-LABEL: add_v2i8_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q2, #0xff
-; CHECK-NEXT:    vand q2, q1, q2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -1345,23 +1219,21 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) {
 ; CHECK-LABEL: add_v2i64_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -1425,23 +1297,21 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -1465,29 +1335,25 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %b,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[1], r2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -1596,29 +1462,24 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.32 q1[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsb.w r12, r3, #0
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[2], r3
-; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r12
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r12
+; CHECK-NEXT:    vmov.u16 r12, q0[1]
 ; CHECK-NEXT:    vmov.u16 r3, q0[0]
-; CHECK-NEXT:    vmov.32 q4[0], r3
-; CHECK-NEXT:    vmov.u16 r3, q0[1]
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r12
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r12, s15
@@ -1627,73 +1488,62 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    orr.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    add lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds.w r4, lr, r3
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adc.w lr, r12, r2
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    adds.w r12, r4, r3
-; CHECK-NEXT:    adc.w lr, lr, r2
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov.u16 r3, q2[4]
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q2[0], r4
-; CHECK-NEXT:    vmov.32 q2[1], r4
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q2[2], r4
-; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[5]
 ; CHECK-NEXT:    vmov.u16 r4, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    adds.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adds.w lr, lr, r4
 ; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vand q0, q3, q1
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -1725,134 +1575,114 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q1, q3, q2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r2, r12, #1
+; CHECK-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[2], r3
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
 ; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    sxth r2, r2
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.u16 r3, q0[1]
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov lr, s11
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adds r5, r4, r2
+; CHECK-NEXT:    ubfx r4, r12, #12, #1
+; CHECK-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    adc.w r12, r12, r5
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.u16 r4, q0[2]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r4
-; CHECK-NEXT:    adc.w r3, r2, r5
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    adcs r3, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w r12, r5, r2
+; CHECK-NEXT:    vmov.u16 r5, q1[6]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u16 r4, q1[4]
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r5
+; CHECK-NEXT:    vmov.u16 r5, q1[7]
+; CHECK-NEXT:    vmov.u16 r4, q1[5]
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r5, r2, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q1[0], r5
-; CHECK-NEXT:    vmov.32 q1[1], r5
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q1[2], r5
-; CHECK-NEXT:    vmov.32 q1[3], r5
-; CHECK-NEXT:    vmov.u16 r5, q0[4]
-; CHECK-NEXT:    sxth r5, r5
-; CHECK-NEXT:    vmov.32 q2[0], r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q2[1], r5
-; CHECK-NEXT:    vmov.u16 r5, q0[5]
-; CHECK-NEXT:    sxth r5, r5
-; CHECK-NEXT:    vmov.32 q2[2], r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q2[3], r5
-; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s5
-; CHECK-NEXT:    adds.w r12, r12, r4
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    adcs r5, r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r5, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmrs r5, p0
+; CHECK-NEXT:    and r2, r5, #1
+; CHECK-NEXT:    ubfx r4, r5, #4, #1
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q1[1], r3
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r4
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.u16 r4, q0[4]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r2
+; CHECK-NEXT:    vand q1, q2, q1
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    ubfx r5, r5, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    rsb.w r5, r5, #0
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r4
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r4
+; CHECK-NEXT:    vmov.u16 r5, q0[7]
+; CHECK-NEXT:    vmov.u16 r4, q0[6]
+; CHECK-NEXT:    sxth r5, r5
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    adds r2, r2, r4
 ; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    adcs r2, r5
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -1869,21 +1699,19 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b,
 ; CHECK-NEXT:    vmov.i64 q2, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.32 q3[1], r2
 ; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    orr.w r12, r3, r2
@@ -1908,32 +1736,28 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b,
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q2, #0xffff
-; CHECK-NEXT:    vand q2, q1, q2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2105,8 +1929,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
@@ -2131,29 +1955,24 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov.16 q1[7], r2
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
-; CHECK-NEXT:    vmov.u16 r2, q5[0]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[1]
-; CHECK-NEXT:    vmov.32 q1[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[2]
-; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q5[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[3]
-; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q5[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q6[0], r3
-; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsb.w r12, r3, #0
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q6[2], r3
-; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r12
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r12
+; CHECK-NEXT:    vmov.u8 r12, q0[1]
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
-; CHECK-NEXT:    vmov.32 q7[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    vmov.32 q7[2], r3
+; CHECK-NEXT:    vmov q7[2], q7[0], r3, r12
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r12, s27
@@ -2162,81 +1981,71 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    orr.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    add lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q6[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    vmov.32 q6[2], r2
-; CHECK-NEXT:    vmov.32 q6[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    vmov.32 q7[0], r2
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    vmov.32 q7[2], r2
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
+; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    adds.w r4, lr, r3
+; CHECK-NEXT:    adds.w lr, lr, r3
 ; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    adc.w lr, r12, r2
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s27
-; CHECK-NEXT:    adds.w r12, r4, r3
-; CHECK-NEXT:    adc.w lr, lr, r2
-; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[5]
-; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov.u16 r3, q5[4]
+; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[6]
-; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[7]
-; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q5[5]
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q5[0], r4
-; CHECK-NEXT:    vmov.32 q5[1], r4
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q5[2], r4
-; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[5]
 ; CHECK-NEXT:    vmov.u8 r4, q0[4]
-; CHECK-NEXT:    vmov.32 q6[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
-; CHECK-NEXT:    vmov.32 q6[2], r4
+; CHECK-NEXT:    vmov q6[2], q6[0], r4, r3
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r4, s21
-; CHECK-NEXT:    adds.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vmov r3, s21
+; CHECK-NEXT:    adds.w lr, lr, r4
 ; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, s23
+; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q5[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q6[0], r2
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s23
-; CHECK-NEXT:    adc.w lr, r12, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r3, r2
+; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    adc.w r4, r12, r2
+; CHECK-NEXT:    vmov r2, s23
+; CHECK-NEXT:    adds.w r12, lr, r3
+; CHECK-NEXT:    adc.w lr, r4, r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -2253,121 +2062,104 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    adc.w lr, lr, r4
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.32 q3[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r4, q2[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r4, q2[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r4, r2, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q3[0], r4
-; CHECK-NEXT:    vmov.32 q3[1], r4
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    ubfx r4, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q3[2], r4
-; CHECK-NEXT:    vmov.32 q3[3], r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r4
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r4
+; CHECK-NEXT:    vmov.u8 r3, q0[9]
 ; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r4, s13
-; CHECK-NEXT:    adds.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    adds.w r5, r12, r4
 ; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    adds.w r4, r4, r12
 ; CHECK-NEXT:    adc.w r12, lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    ubfx r4, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r4
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r4
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.u8 r4, q0[10]
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r4, s12
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s15
-; CHECK-NEXT:    adc.w lr, r12, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adds.w r12, r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    vmov.u16 r4, q2[4]
+; CHECK-NEXT:    adc.w r12, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q2[6]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q2[7]
+; CHECK-NEXT:    vmov.u16 r4, q2[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r4, r2, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q2[0], r4
-; CHECK-NEXT:    vmov.32 q2[1], r4
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
+; CHECK-NEXT:    vmrs r3, p0
+; CHECK-NEXT:    and r2, r3, #1
+; CHECK-NEXT:    ubfx r4, r3, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q2[2], r4
-; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
 ; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov.32 q3[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov.32 q3[2], r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    adds.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adc.w lr, lr, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, lr, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds r2, r2, r5
+; CHECK-NEXT:    adc.w r5, r12, r4
+; CHECK-NEXT:    ubfx r4, r3, #12, #1
+; CHECK-NEXT:    ubfx r3, r3, #8, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vand q0, q3, q1
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
+; CHECK-NEXT:    vmov.u8 r3, q0[15]
+; CHECK-NEXT:    vmov.u8 r4, q0[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adcs r3, r5
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -2406,280 +2198,240 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov.16 q4[7], r2
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q4, q2, q1
-; CHECK-NEXT:    vmov.u16 r2, q4[0]
-; CHECK-NEXT:    vmov.32 q5[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q4[1]
-; CHECK-NEXT:    vmov.32 q5[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[2]
-; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q4[0]
+; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[3]
-; CHECK-NEXT:    vmov.32 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q4[1]
+; CHECK-NEXT:    vmov q5[3], q5[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q5[0], r3
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r2, r12, #1
+; CHECK-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q5[2], r3
-; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q6[0], r3
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q6[2], r3
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    asrs r2, r2, #31
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    vmov r2, s22
 ; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov r12, s23
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    vmov lr, s23
+; CHECK-NEXT:    vmov r3, s21
+; CHECK-NEXT:    adds r5, r4, r2
+; CHECK-NEXT:    ubfx r4, r12, #12, #1
+; CHECK-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    adc.w r12, r12, r5
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r4
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r4
 ; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.u8 r4, q0[2]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q6[2], q6[0], r4, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q6[3], q6[1], r4, r2
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov r4, s20
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov r5, s23
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r4
-; CHECK-NEXT:    adc.w r3, r2, r5
-; CHECK-NEXT:    vmov.u16 r2, q4[4]
-; CHECK-NEXT:    vmov.32 q5[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q4[5]
-; CHECK-NEXT:    vmov.32 q5[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q4[6]
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q4[7]
-; CHECK-NEXT:    vmov.32 q5[3], r2
+; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    vmov r4, s23
+; CHECK-NEXT:    adcs r3, r2
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    adds.w r12, r5, r2
+; CHECK-NEXT:    vmov.u16 r5, q4[6]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u16 r4, q4[4]
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r5
+; CHECK-NEXT:    vmov.u16 r5, q4[7]
+; CHECK-NEXT:    vmov.u16 r4, q4[5]
+; CHECK-NEXT:    vmov q5[3], q5[1], r4, r5
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r5, r2, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q4[0], r5
-; CHECK-NEXT:    vmov.32 q4[1], r5
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q4[2], r5
-; CHECK-NEXT:    vmov.32 q4[3], r5
-; CHECK-NEXT:    vmov.u8 r5, q0[4]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q5[1], r5
-; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    vmrs r5, p0
+; CHECK-NEXT:    and r2, r5, #1
+; CHECK-NEXT:    ubfx r4, r5, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r4
+; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q5[3], q5[1], r4, r2
+; CHECK-NEXT:    vand q4, q5, q4
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    ubfx r5, r5, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    rsb.w r5, r5, #0
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
+; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
+; CHECK-NEXT:    vmov.u8 r5, q0[7]
+; CHECK-NEXT:    vmov.u8 r4, q0[6]
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    vmov.32 q5[2], r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r5
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q5[3], r5
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q5[3], q5[1], r4, r5
 ; CHECK-NEXT:    vand q4, q5, q4
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    vmov r5, s17
-; CHECK-NEXT:    adds.w r12, r12, r4
+; CHECK-NEXT:    adds r2, r2, r4
 ; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    adcs r5, r3
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r5, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q4[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q4[1], r3
-; CHECK-NEXT:    vmov.32 q4[2], r2
-; CHECK-NEXT:    vmov.32 q4[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q5[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q5[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q5[3], r2
-; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    adcs r3, r5
 ; CHECK-NEXT:    vmov r5, s19
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r4
-; CHECK-NEXT:    adc.w r3, r2, r5
-; CHECK-NEXT:    vmov.u8 r2, q3[8]
-; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[9]
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[10]
-; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[11]
-; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[12]
-; CHECK-NEXT:    vmov.16 q4[4], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[13]
-; CHECK-NEXT:    vmov.16 q4[5], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[14]
-; CHECK-NEXT:    vmov.16 q4[6], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[15]
-; CHECK-NEXT:    vmov.16 q4[7], r2
+; CHECK-NEXT:    adds.w r12, r2, r4
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q3[8]
+; CHECK-NEXT:    vmov.16 q4[0], r5
+; CHECK-NEXT:    vmov.u8 r5, q3[9]
+; CHECK-NEXT:    vmov.16 q4[1], r5
+; CHECK-NEXT:    vmov.u8 r5, q3[10]
+; CHECK-NEXT:    vmov.16 q4[2], r5
+; CHECK-NEXT:    vmov.u8 r5, q3[11]
+; CHECK-NEXT:    vmov.16 q4[3], r5
+; CHECK-NEXT:    vmov.u8 r5, q3[12]
+; CHECK-NEXT:    vmov.16 q4[4], r5
+; CHECK-NEXT:    vmov.u8 r5, q3[13]
+; CHECK-NEXT:    vmov.16 q4[5], r5
+; CHECK-NEXT:    vmov.u8 r5, q3[14]
+; CHECK-NEXT:    vmov.16 q4[6], r5
+; CHECK-NEXT:    vmov.u8 r5, q3[15]
+; CHECK-NEXT:    vmov.16 q4[7], r5
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r5, q1[2]
+; CHECK-NEXT:    vmov.u16 r4, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r5
+; CHECK-NEXT:    vmov.u16 r5, q1[3]
+; CHECK-NEXT:    vmov.u16 r4, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r5, r2, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q2[0], r5
-; CHECK-NEXT:    vmov.32 q2[1], r5
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q2[2], r5
-; CHECK-NEXT:    vmov.32 q2[3], r5
-; CHECK-NEXT:    vmov.u8 r5, q0[8]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q3[1], r5
-; CHECK-NEXT:    vmov.u8 r5, q0[9]
+; CHECK-NEXT:    vmrs r5, p0
+; CHECK-NEXT:    and r2, r5, #1
+; CHECK-NEXT:    ubfx r4, r5, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    vmov.u8 r4, q0[8]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
+; CHECK-NEXT:    vand q2, q3, q2
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    ubfx r5, r5, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    rsb.w r5, r5, #0
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r5, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r5, r4
+; CHECK-NEXT:    vmov.u8 r5, q0[11]
+; CHECK-NEXT:    vmov.u8 r4, q0[10]
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q3[3], r5
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    adds.w r12, r12, r4
+; CHECK-NEXT:    adds r2, r2, r4
 ; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adcs r5, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r5, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s11
+; CHECK-NEXT:    adds.w r12, r2, r4
+; CHECK-NEXT:    vmov.u16 r4, q1[4]
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov.u16 r5, q1[6]
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r5
+; CHECK-NEXT:    vmov.u16 r5, q1[7]
+; CHECK-NEXT:    vmov.u16 r4, q1[5]
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
+; CHECK-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-NEXT:    vmrs r5, p0
+; CHECK-NEXT:    and r2, r5, #1
+; CHECK-NEXT:    ubfx r4, r5, #4, #1
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r4
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    vmov.u8 r4, q0[12]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r4
-; CHECK-NEXT:    adc.w r3, r2, r5
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r5, r2, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q1[0], r5
-; CHECK-NEXT:    vmov.32 q1[1], r5
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q1[2], r5
-; CHECK-NEXT:    vmov.32 q1[3], r5
-; CHECK-NEXT:    vmov.u8 r5, q0[12]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    vmov.32 q2[0], r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q2[1], r5
-; CHECK-NEXT:    vmov.u8 r5, q0[13]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    vmov.32 q2[2], r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r2
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s5
-; CHECK-NEXT:    adds.w r12, r12, r4
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    adcs r5, r3
-; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r5, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q1[1], r3
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    ubfx r4, r5, #12, #1
+; CHECK-NEXT:    ubfx r5, r5, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    rsb.w r5, r5, #0
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r4
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r4
+; CHECK-NEXT:    vmov.u8 r5, q0[15]
+; CHECK-NEXT:    vmov.u8 r4, q0[14]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    adds r2, r2, r4
 ; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    adcs r2, r5
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -2697,21 +2449,19 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i6
 ; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.32 q3[1], r2
 ; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    orr.w r12, r3, r2
@@ -2736,32 +2486,28 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q2, #0xff
-; CHECK-NEXT:    vand q2, q1, q2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2785,23 +2531,21 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %b, i64
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r2
 ; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r12, s5
 ; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    orrs.w r3, r3, r12
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index ee15f82a71f5..efc55e209756 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -429,19 +429,17 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
 ; CHECK-NEXT:    vmov.i64 q2, #0xff
-; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r0, s14
@@ -449,189 +447,163 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    umull r12, r1, r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    orr.w lr, r3, r1
-; CHECK-NEXT:    vmov.u8 r3, q1[2]
-; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[3]
-; CHECK-NEXT:    vmov.32 q3[2], r3
-; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
 ; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    umull r0, r3, r0, r3
-; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov q4[2], q4[0], r1, r3
 ; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    umull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vmov r1, s20
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, lr, r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[4]
-; CHECK-NEXT:    vmov.u8 r3, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    umull r0, r1, r1, r0
+; CHECK-NEXT:    umull r3, r4, r4, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r1
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    adc.w r0, r0, lr
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[4]
+; CHECK-NEXT:    adc.w r12, r0, r1
+; CHECK-NEXT:    vmov.u8 r1, q1[5]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[0], r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    umull r1, r3, r3, r1
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r1
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q0[6]
+; CHECK-NEXT:    vmov r1, s12
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r1, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[6]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[7]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov.32 q5[1], r3
 ; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[8]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[8]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q0[8]
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r1, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[9]
-; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[8]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov.32 q5[1], r3
 ; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[10]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q0[10]
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r1, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[11]
-; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[10]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[11]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov.32 q5[1], r3
 ; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[12]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[12]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q0[12]
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds r1, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[12]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov.32 q5[1], r3
 ; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q0[14]
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r12, r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[15]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.u8 r3, q0[14]
-; CHECK-NEXT:    vand q1, q3, q2
-; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q1[14]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[15]
-; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umlal r0, r1, r3, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    umlal r0, r1, r3, r2
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -643,152 +615,140 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.u8 r2, q1[0]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
 ; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r1, r3, r3, r1
-; CHECK-NEXT:    vmov.32 q2[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.u8 r2, q0[2]
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    smull r1, r3, r3, r1
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    smull r0, r2, r2, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[4]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[4]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds.w lr, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds.w lr, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[7]
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[8]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[8]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds.w lr, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[9]
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[9]
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[10]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds.w lr, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[11]
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[11]
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r12, r0, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[12]
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[12]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r0, r0, r12
+; CHECK-NEXT:    adds.w lr, r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    adc.w r12, r0, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r0, r0, r12
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r2, r12, r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[14]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
@@ -798,7 +758,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smlal r0, r1, r3, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %yy = sext <16 x i8> %y to <16 x i64>
@@ -1356,211 +1316,183 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov.u8 r2, q1[0]
-; CHECK-NEXT:    vmov.u8 r3, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q1[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
 ; CHECK-NEXT:    vmov.i64 q2, #0xff
-; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov.u8 r4, q0[2]
-; CHECK-NEXT:    umull r12, lr, r3, r2
+; CHECK-NEXT:    vmov r12, s14
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov.u8 r4, q1[2]
 ; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov.u8 r5, q0[2]
+; CHECK-NEXT:    umull r12, lr, r2, r12
 ; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[3]
-; CHECK-NEXT:    vmov.32 q4[2], r4
-; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    umull r2, r3, r3, r2
 ; CHECK-NEXT:    orr.w lr, lr, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[2]
-; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[3]
-; CHECK-NEXT:    vmov.32 q3[2], r3
-; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q0[3]
+; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
 ; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    umull r3, r4, r4, r3
-; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov.32 q5[1], r4
 ; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r6, s16
 ; CHECK-NEXT:    umull r3, r4, r4, r3
-; CHECK-NEXT:    vmov.32 q5[2], r3
-; CHECK-NEXT:    vmov.32 q5[3], r4
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, lr, r5
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    umull r5, r6, r6, r5
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r3
 ; CHECK-NEXT:    vmov.u8 r5, q1[4]
+; CHECK-NEXT:    vmov q3[3], q3[1], r6, r4
+; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds.w r12, r2, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[5]
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
 ; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
 ; CHECK-NEXT:    vmov.u8 r4, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[5]
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
-; CHECK-NEXT:    vmov.32 q3[2], r5
-; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov.32 q5[1], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[2], r5
-; CHECK-NEXT:    vmov.32 q5[3], r4
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    adds.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[6]
-; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    umull r2, r4, r2, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
 ; CHECK-NEXT:    vmov.u8 r4, q0[6]
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[7]
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[7]
-; CHECK-NEXT:    vmov.32 q3[2], r5
-; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds.w r6, r6, r12
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[7]
+; CHECK-NEXT:    adc.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[6]
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q0[7]
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov.32 q5[1], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[2], r5
-; CHECK-NEXT:    vmov.32 q5[3], r4
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    adds.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[8]
-; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    umull r2, r4, r2, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
 ; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[9]
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov.32 q3[2], r5
-; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[9]
+; CHECK-NEXT:    adc.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[8]
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q0[9]
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov.32 q5[1], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[2], r5
-; CHECK-NEXT:    vmov.32 q5[3], r4
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    adds.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[10]
-; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    umull r2, r4, r2, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
 ; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[11]
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    vmov.32 q3[2], r5
-; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[11]
+; CHECK-NEXT:    adc.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[10]
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q0[11]
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov.32 q5[1], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[2], r5
-; CHECK-NEXT:    vmov.32 q5[3], r4
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    adds.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[12]
-; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    umull r2, r4, r2, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
 ; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[13]
-; CHECK-NEXT:    vmov.32 q4[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov.32 q3[2], r5
-; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[13]
+; CHECK-NEXT:    adc.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[12]
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q0[13]
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov.32 q5[1], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q5[2], r5
-; CHECK-NEXT:    vmov.32 q5[3], r4
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds.w r2, r2, r12
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    adds r2, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[14]
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[15]
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    umull r2, r4, r2, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
 ; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    vand q1, q3, q2
-; CHECK-NEXT:    vmov.32 q3[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    vmov.32 q3[2], r4
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vand q0, q3, q2
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    umlal r2, r3, r4, r5
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    umlal r2, r3, r4, r5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[15]
+; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[14]
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q0[15]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov r6, s4
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    umlal r3, r2, r5, r6
+; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    umlal r3, r2, r5, r6
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -1573,166 +1505,152 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov.u8 r2, q1[0]
-; CHECK-NEXT:    vmov.u8 r3, q0[0]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[1]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    smull r12, r3, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[0]
+; CHECK-NEXT:    sxtb.w lr, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r4, q1[2]
+; CHECK-NEXT:    smull r2, lr, r2, lr
+; CHECK-NEXT:    vmov.u8 r5, q0[2]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r12
+; CHECK-NEXT:    smull r4, r5, r5, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], lr, r3
 ; CHECK-NEXT:    vmov lr, s10
 ; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    vmov r12, s9
 ; CHECK-NEXT:    adds.w lr, lr, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[2]
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[3]
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    adc.w r12, r12, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[3]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r5, r3
 ; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w lr, r4, r2
-; CHECK-NEXT:    vmov.u8 r4, q1[4]
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    smull r2, r4, r2, r4
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    adds.w r5, lr, r4
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w lr, r5, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[5]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    adc.w r12, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[5]
+; CHECK-NEXT:    vmov.u8 r2, q1[4]
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
+; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r2, r4, r4, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    smull r3, r4, r4, r3
+; CHECK-NEXT:    smull r2, r5, r5, r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r5, r4
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r5, s10
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    adc.w r12, r12, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[6]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r4, r2, r4
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[7]
+; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[7]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[6]
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r2, r4, r4, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r5, s10
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    adc.w r12, r12, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[8]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r4, r2, r4
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[9]
-; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[9]
+; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[8]
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r2, r4, r4, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r5, s10
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    adc.w r12, r12, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[10]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r4, r2, r4
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[11]
-; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[11]
+; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[10]
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
+; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r2, r4, r4, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r5, s10
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    adc.w r12, r12, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[12]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r4, r2, r4
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w lr, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[13]
+; CHECK-NEXT:    adc.w r12, r2, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[12]
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
+; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r2, r4, r4, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r5, s10
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    adc.w r3, r12, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[14]
-; CHECK-NEXT:    sxtb.w r12, r4
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[14]
+; CHECK-NEXT:    adcs r2, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[14]
+; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smlal r2, r3, r4, r12
-; CHECK-NEXT:    vmov.u8 r4, q1[15]
-; CHECK-NEXT:    sxtb.w r12, r4
+; CHECK-NEXT:    smlal r3, r2, r4, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[15]
 ; CHECK-NEXT:    vmov.u8 r4, q0[15]
+; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smlal r2, r3, r4, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    smlal r3, r2, r4, r5
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %yy = sext <16 x i8> %y to <16 x i64>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 72462bb87f02..dbdb9fc925d7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -50,21 +50,19 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmullb.u32 q3, q0, q1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmullb.u32 q3, q0, q1
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -86,21 +84,19 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
 ; CHECK-LABEL: add_v2i32_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmullb.s32 q3, q0, q1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmullb.s32 q3, q0, q1
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -289,44 +285,37 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
 ; CHECK-LABEL: add_v2i16_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q4, q0, q3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vand q1, q2, q3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -342,36 +331,32 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2
 ; CHECK-LABEL: add_v2i16_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q3, #0xffff
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vand q3, q2, q3
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    sxth r2, r2
 ; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    sxth r3, r3
 ; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q3[1], r1
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    sxth r1, r1
 ; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vand q0, q3, q2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -605,22 +590,20 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #88
-; CHECK-NEXT:    sub sp, #88
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov.i8 q0, #0x0
-; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    .pad #40
+; CHECK-NEXT:    sub sp, #40
+; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vpsel q5, q1, q0
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q5, q2, q0
+; CHECK-NEXT:    vmov q3, q2
 ; CHECK-NEXT:    vmov.u8 r0, q5[0]
-; CHECK-NEXT:    vmov.i64 q4, #0xff
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.16 q2[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q5[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
@@ -636,181 +619,149 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q2[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q5[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vpsel q6, q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q6[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q6[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r3, q4[0]
+; CHECK-NEXT:    vpsel q6, q3, q0
 ; CHECK-NEXT:    vmov.u16 r0, q6[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q6[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q6[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q6[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vmrs r0, p0
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.u8 r2, q1[0]
-; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q7[0], r1
-; CHECK-NEXT:    vmov.32 q7[1], r1
+; CHECK-NEXT:    and r2, r0, #1
 ; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q7[2], r1
-; CHECK-NEXT:    vmov.32 q7[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[0]
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[1]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vand q2, q0, q4
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vand q1, q0, q4
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    umull r1, r2, r2, r1
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    umull r1, r2, r2, r1
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
+; CHECK-NEXT:    vmov q7[3], q7[1], r2, r1
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
+; CHECK-NEXT:    vmov.u8 r2, q1[0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    vmov.u8 r2, q4[1]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r1, r12, r2, r1
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    umull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
 ; CHECK-NEXT:    vand q0, q0, q7
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r12, s3
 ; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r3, r12
-; CHECK-NEXT:    ubfx r3, r0, #8, #1
+; CHECK-NEXT:    adds.w lr, r2, r1
+; CHECK-NEXT:    vmov.u8 r1, q4[2]
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    ubfx r3, r0, #12, #1
+; CHECK-NEXT:    ubfx r0, r0, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    vmov.32 q7[0], r3
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q7[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q2[2]
-; CHECK-NEXT:    vmov.32 q7[2], r0
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    vmov.32 q7[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[2]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[3]
-; CHECK-NEXT:    vmov.u8 r3, q2[3]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q1[2], r3
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vand q1, q1, q4
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    umull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov q7[2], q7[0], r0, r3
+; CHECK-NEXT:    vmov q7[3], q7[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.u8 r3, q1[2]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    vmov.u8 r3, q4[3]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r3
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    umull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vand q0, q2, q7
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    umull r0, r1, r1, r0
+; CHECK-NEXT:    umull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vand q0, q0, q7
+; CHECK-NEXT:    vmov q7, q4
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r2, r12, r0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r2, r3
-; CHECK-NEXT:    vmov.u16 r2, q6[4]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q6[5]
-; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u8 r1, q4[4]
+; CHECK-NEXT:    adc.w lr, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q6[6]
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q6[4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[7]
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q6[5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmrs lr, p0
-; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
-; CHECK-NEXT:    and r3, lr, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q6[0], r3
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r0, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q6[2], r3
-; CHECK-NEXT:    vmov.32 q6[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q3[4]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q3[5]
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov.u8 r0, q3[4]
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[5]
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov q7, q3
-; CHECK-NEXT:    vand q1, q1, q4
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    umull r0, r3, r0, r3
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
+; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.u8 r3, q1[4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    vmov.u8 r3, q4[5]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r3
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    umull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vand q0, q2, q6
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    umull r0, r1, r1, r0
+; CHECK-NEXT:    umull r3, r4, r4, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r1
+; CHECK-NEXT:    vand q0, q0, q6
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    adds r3, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q3[6]
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    ubfx r0, lr, #8, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.32 q6[1], r0
-; CHECK-NEXT:    ubfx r0, lr, #12, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.32 q6[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[7]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vand q1, q1, q4
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adds.w r1, r1, r12
+; CHECK-NEXT:    adc.w r3, lr, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q6
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    adc.w r1, r3, r4
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r4, q4[6]
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.u8 r3, q1[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.u8 r3, q4[7]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vand q0, q0, q6
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    adds.w r2, r2, r12
 ; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    adds.w r12, r2, r0
 ; CHECK-NEXT:    vmov.u8 r2, q5[8]
 ; CHECK-NEXT:    vmov.16 q6[0], r2
@@ -828,182 +779,152 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q6[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[15]
 ; CHECK-NEXT:    vmov.16 q6[7], r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adc.w lr, r1, r3
 ; CHECK-NEXT:    vcmp.i16 ne, q6, zr
 ; CHECK-NEXT:    vmov.u8 r0, q7[8]
-; CHECK-NEXT:    vpsel q3, q1, q0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r2, q3[0]
-; CHECK-NEXT:    vmov.u8 r0, q7[9]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[1]
-; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vpsel q3, q3, q0
 ; CHECK-NEXT:    vmov.u16 r2, q3[2]
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q3[0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[3]
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r3, q3[1]
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmrs lr, p0
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    and r3, lr, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q4[0], r3
-; CHECK-NEXT:    vmov.32 q4[1], r3
-; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q4[2], r3
-; CHECK-NEXT:    vmov.32 q4[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q6[8]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q6[9]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov q4[3], q4[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[9]
+; CHECK-NEXT:    vmov.u8 r4, q1[8]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q7[9]
+; CHECK-NEXT:    vmov q5[2], q5[0], r0, r4
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q5, q5, q2
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r1, s20
 ; CHECK-NEXT:    umull r0, r3, r0, r3
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    umull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vand q0, q2, q4
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r1, r4, r1, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    adds r3, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q7[10]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q7[11]
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    ubfx r0, lr, #8, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    ubfx r0, lr, #12, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[10]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[11]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adds.w r1, r1, r12
+; CHECK-NEXT:    adc.w r3, lr, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q4
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    adc.w r1, r3, r4
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r4, q7[10]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.u8 r3, q1[10]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.u8 r3, q7[11]
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q5, q5, q2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vand q0, q0, q4
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    adds.w r2, r2, r12
 ; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    adds.w r12, r2, r0
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[5]
-; CHECK-NEXT:    vmov.32 q0[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[6]
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    adc.w lr, r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q3[4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[7]
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmov.u16 r3, q3[5]
 ; CHECK-NEXT:    vmov.u8 r0, q7[12]
-; CHECK-NEXT:    vmrs lr, p0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[13]
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    and r3, lr, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[2], r3
-; CHECK-NEXT:    vmov.32 q3[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q6[12]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q6[13]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[13]
+; CHECK-NEXT:    vmov.u8 r4, q1[12]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q7[13]
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r4
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r1, s16
 ; CHECK-NEXT:    umull r0, r3, r0, r3
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    umull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vand q0, q2, q3
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r1, r4, r1, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    adds r3, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q7[14]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q7[15]
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    adcs r1, r0
-; CHECK-NEXT:    ubfx r0, lr, #8, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    ubfx r0, lr, #12, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[14]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[15]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adds.w r1, r1, r12
+; CHECK-NEXT:    adc.w r3, lr, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q2, q3
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    adc.w r1, r3, r4
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r4, q7[14]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    vmov.u8 r3, q1[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.u8 r3, q7[15]
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r3
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    umull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    adds.w r2, r2, r12
 ; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #88
+; CHECK-NEXT:    add sp, #40
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -1017,15 +938,18 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.u8 r3, q1[0]
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    vmov.u8 r0, q4[0]
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
 ; CHECK-NEXT:    vmov.16 q5[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q4[1]
 ; CHECK-NEXT:    vmov.16 q5[1], r0
@@ -1041,147 +965,128 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q5[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q4[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q5[0]
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[1]
-; CHECK-NEXT:    vmov.32 q6[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[2]
-; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q5[0]
+; CHECK-NEXT:    vmov q6[2], q6[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[3]
-; CHECK-NEXT:    vmov.32 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q5[1]
+; CHECK-NEXT:    vmov q6[3], q6[1], r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r0, p0
-; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    and r2, r0, #1
 ; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov.32 q6[2], r1
-; CHECK-NEXT:    vmov.32 q6[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    smull r1, r2, r2, r1
-; CHECK-NEXT:    vmov.32 q7[0], r1
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q7[1], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[1]
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r1, r2, r2, r1
-; CHECK-NEXT:    vmov.32 q7[2], r1
-; CHECK-NEXT:    vmov.32 q7[3], r2
+; CHECK-NEXT:    smull r1, r12, r2, r1
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    smull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
+; CHECK-NEXT:    vmov q7[3], q7[1], r3, r12
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r1, s26
-; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov r3, s26
+; CHECK-NEXT:    vmov r1, s24
 ; CHECK-NEXT:    vmov r12, s27
-; CHECK-NEXT:    vmov r3, s25
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r3, r12
-; CHECK-NEXT:    ubfx r3, r0, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    vmov.32 q6[0], r3
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q6[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    smull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q7[0], r0
+; CHECK-NEXT:    vmov r2, s25
+; CHECK-NEXT:    adds.w lr, r1, r3
+; CHECK-NEXT:    ubfx r3, r0, #12, #1
+; CHECK-NEXT:    ubfx r0, r0, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    rsb.w r0, r0, #0
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
+; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q7[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    vmov.u8 r2, q0[2]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    smull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q7[2], r0
-; CHECK-NEXT:    vmov.32 q7[3], r3
+; CHECK-NEXT:    smull r1, r2, r2, r1
+; CHECK-NEXT:    vmov q7[2], q7[0], r1, r0
+; CHECK-NEXT:    vmov q7[3], q7[1], r2, r3
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vmov r1, s24
 ; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adc.w r2, r12, r0
 ; CHECK-NEXT:    vmov r0, s26
 ; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    adc.w r1, r2, r3
-; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[5]
-; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u8 r1, q1[4]
+; CHECK-NEXT:    adc.w lr, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q5[6]
-; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q5[4]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[7]
-; CHECK-NEXT:    vmov.32 q6[3], r2
-; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov.u16 r3, q5[5]
+; CHECK-NEXT:    smull r1, r4, r4, r1
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q5[0], r3
-; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    and r0, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q5[2], r3
-; CHECK-NEXT:    vmov.32 q5[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q1[4]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r0, r3, r0, r3
-; CHECK-NEXT:    vmov.32 q6[0], r0
+; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
+; CHECK-NEXT:    vmov q5[3], q5[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    vmov q6[2], q6[0], r1, r0
+; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov r1, s20
 ; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r1, s22
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    adds r3, r3, r1
-; CHECK-NEXT:    adc.w r1, r12, r0
-; CHECK-NEXT:    ubfx r0, r2, #8, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    ubfx r0, r2, #12, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q6[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov r3, s23
+; CHECK-NEXT:    adds.w r1, r1, r12
+; CHECK-NEXT:    adc.w r0, r0, lr
+; CHECK-NEXT:    adds r1, r1, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[6]
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.u8 r3, q0[7]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q6[2], q6[0], r0, r2
+; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r12, r0
 ; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adds.w r12, r2, r0
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    adc.w lr, r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -1198,155 +1103,133 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vpsel q2, q3, q2
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[1]
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[0], r3
-; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    and r4, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q3[2], r3
-; CHECK-NEXT:    vmov.32 q3[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q1[8]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r0, r3, r0, r3
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.32 q4[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[9]
+; CHECK-NEXT:    vmov.u8 r4, q0[9]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r3, r4, r4, r3
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
+; CHECK-NEXT:    vmov q4[3], q4[1], r1, r4
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    adds r3, r3, r1
-; CHECK-NEXT:    adc.w r1, r12, r0
-; CHECK-NEXT:    ubfx r0, r2, #8, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    ubfx r0, r2, #12, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.32 q4[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    adds.w r1, r1, r12
+; CHECK-NEXT:    adc.w r0, r0, lr
+; CHECK-NEXT:    adds r1, r1, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[10]
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.u8 r3, q0[11]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.32 q4[3], r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q4[2], q4[0], r0, r2
+; CHECK-NEXT:    vmov q4[3], q4[1], r4, r3
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r12, r0
 ; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    adds.w r12, r2, r0
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    adc.w lr, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q2[4]
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q2[5]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    and r4, r2, #1
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q2[2], r3
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q1[12]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r0, r3, r0, r3
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r3, q1[13]
+; CHECK-NEXT:    vmov.u8 r4, q0[13]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r0, r3, r3, r0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r3, r4, r4, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r4
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    adds r3, r3, r1
-; CHECK-NEXT:    adc.w r1, r12, r0
-; CHECK-NEXT:    ubfx r0, r2, #8, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    ubfx r0, r2, #12, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w r1, r1, r12
+; CHECK-NEXT:    adc.w r0, r0, lr
+; CHECK-NEXT:    adds r1, r1, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[14]
+; CHECK-NEXT:    adc.w r12, r0, r3
+; CHECK-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    vmov.u8 r3, q0[15]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vand q0, q3, q2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    smull r0, r4, r0, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r12, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -1360,44 +1243,37 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
 ; CHECK-LABEL: add_v2i8_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q4, q0, q3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vand q1, q2, q3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1413,36 +1289,32 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i
 ; CHECK-LABEL: add_v2i8_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vand q3, q2, q3
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.32 q3[1], r1
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vand q0, q3, q2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -1463,40 +1335,38 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
 ; CHECK-LABEL: add_v2i64_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    umull r12, r3, r1, r0
-; CHECK-NEXT:    mla r1, r1, r2, r3
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov.32 q3[0], r12
-; CHECK-NEXT:    mla r0, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    umull r2, r3, r1, r0
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    mla r1, r1, r2, r3
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    mla r0, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    umull lr, r12, r1, r0
+; CHECK-NEXT:    umull r4, r5, r2, r3
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, lr
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    mla r1, r1, r4, r12
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    mla r0, r4, r0, r1
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    mla r1, r2, r1, r5
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    mla r1, r2, r3, r1
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s9
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    cset r1, eq
+; CHECK-NEXT:    tst.w r1, #1
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -1504,7 +1374,7 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y
@@ -1567,21 +1437,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmullb.u32 q3, q0, q1
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r2
 ; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmullb.u32 q3, q0, q1
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -1608,21 +1476,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmullb.s32 q3, q0, q1
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r2
 ; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmullb.s32 q3, q0, q1
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -1826,37 +1692,31 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q4, q0, q3
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    umull lr, r12, r3, r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vand q1, q2, q3
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.32 q0[3], r3
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    umull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
 ; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -1865,7 +1725,6 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK-NEXT:    adc.w r3, lr, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -1884,36 +1743,32 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q3, #0xffff
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vand q3, q2, q3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxth r2, r2
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r3
-; CHECK-NEXT:    vand q0, q3, q2
+; CHECK-NEXT:    smull lr, r12, r3, r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    smull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2166,18 +2021,16 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #80
-; CHECK-NEXT:    sub sp, #80
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov.i8 q0, #0x0
-; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vpsel q5, q1, q0
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q5, q2, q0
+; CHECK-NEXT:    vmov q3, q2
 ; CHECK-NEXT:    vmov.u8 r2, q5[0]
-; CHECK-NEXT:    vmov.i64 q4, #0xff
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.16 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r2
@@ -2193,184 +2046,152 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q2[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vpsel q6, q1, q0
-; CHECK-NEXT:    vmov.u16 r2, q6[0]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q6[1]
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vpsel q6, q3, q0
 ; CHECK-NEXT:    vmov.u16 r2, q6[2]
-; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q6[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[3]
-; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q6[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vmrs lr, p0
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    and r3, lr, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q7[0], r3
-; CHECK-NEXT:    vmov.32 q7[1], r3
+; CHECK-NEXT:    and r2, lr, #1
 ; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q7[2], r3
-; CHECK-NEXT:    vmov.32 q7[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q3[0]
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q3[1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov q7[2], q7[0], r2, r3
+; CHECK-NEXT:    vmov q7[3], q7[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[1]
 ; CHECK-NEXT:    vmov.u8 r3, q1[0]
-; CHECK-NEXT:    vand q2, q0, q4
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q1[1]
-; CHECK-NEXT:    vmov r12, s8
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vand q1, q0, q4
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    umull r3, r2, r3, r12
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.u8 r3, q4[1]
+; CHECK-NEXT:    vmov.u8 r2, q4[0]
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    umull r2, r12, r2, r12
+; CHECK-NEXT:    umull r3, r4, r4, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r12
 ; CHECK-NEXT:    vand q0, q0, q7
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r4, r4, r2
-; CHECK-NEXT:    ubfx r2, lr, #8, #1
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    vmov.32 q7[0], r2
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov.32 q7[1], r2
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    adds r6, r2, r3
 ; CHECK-NEXT:    ubfx r2, lr, #12, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r3, q2[2]
-; CHECK-NEXT:    vmov.32 q7[2], r2
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    vmov.32 q7[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[2]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q3[3]
-; CHECK-NEXT:    vmov.u8 r3, q2[3]
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q1[2], r3
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vand q1, q1, q4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r3, q4[2]
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    ubfx r4, lr, #8, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov q7[2], q7[0], r4, r2
+; CHECK-NEXT:    vmov q7[3], q7[1], r4, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    vmov.u8 r4, q1[2]
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
+; CHECK-NEXT:    vmov.u8 r4, q4[3]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r4
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r5, s12
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vand q0, q2, q7
+; CHECK-NEXT:    umull r5, r4, r5, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT:    vmov.u8 r4, q4[4]
+; CHECK-NEXT:    vand q0, q0, q7
+; CHECK-NEXT:    vmov q7, q4
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w lr, r12, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    adds.w r12, r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q6[4]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q6[5]
-; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w r12, r3, r5
+; CHECK-NEXT:    adc.w lr, r2, r6
 ; CHECK-NEXT:    vmov.u16 r2, q6[6]
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r6, q6[4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[7]
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    vmov.u16 r6, q6[5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmrs r6, p0
-; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
-; CHECK-NEXT:    and r4, r6, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q6[0], r4
-; CHECK-NEXT:    vmov.32 q6[1], r4
-; CHECK-NEXT:    ubfx r4, r6, #4, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov.32 q6[2], r4
-; CHECK-NEXT:    vmov.32 q6[3], r4
-; CHECK-NEXT:    vmov.u8 r4, q3[4]
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q3[5]
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.32 q0[2], r4
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov.u8 r3, q3[4]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    ubfx r6, r2, #4, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    vmov q6[2], q6[0], r5, r6
+; CHECK-NEXT:    vmov q6[3], q6[1], r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[5]
+; CHECK-NEXT:    vmov.u8 r5, q1[4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q4[5]
+; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov r5, s14
 ; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q3[5]
-; CHECK-NEXT:    vmov.32 q1[2], r3
-; CHECK-NEXT:    vmov q7, q3
-; CHECK-NEXT:    vand q1, q1, q4
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    umull r6, r5, r5, r6
 ; CHECK-NEXT:    umull r3, r4, r3, r4
-; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.32 q2[1], r4
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    umull r3, r4, r4, r3
-; CHECK-NEXT:    vmov.32 q2[2], r3
-; CHECK-NEXT:    vmov.32 q2[3], r4
-; CHECK-NEXT:    vand q0, q2, q6
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vand q0, q0, q6
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r3, r3, lr
-; CHECK-NEXT:    adds r4, r4, r5
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    ubfx r2, r6, #8, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r3, q3[6]
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    vmov.32 q6[1], r2
-; CHECK-NEXT:    ubfx r2, r6, #12, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q6[2], r2
-; CHECK-NEXT:    vmov.32 q6[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    adds.w r6, r12, r4
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adc.w r5, lr, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    ubfx r6, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r6, r6, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r12, r5, r4
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r6
+; CHECK-NEXT:    vmov.u8 r5, q4[6]
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q3[7]
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q1[2], r3
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vand q1, q1, q4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r6, q1[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    vmov.u8 r6, q4[7]
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    vand q0, q2, q6
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    umull r2, r6, r6, r2
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
+; CHECK-NEXT:    vmov.u8 r4, q7[8]
+; CHECK-NEXT:    vand q0, q0, q6
+; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    adds r3, r3, r6
 ; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    adds r3, r3, r4
 ; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    vmov.u8 r5, q7[8]
-; CHECK-NEXT:    adc.w r3, r2, r6
+; CHECK-NEXT:    adc.w lr, r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q5[8]
 ; CHECK-NEXT:    vmov.16 q6[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[9]
@@ -2388,84 +2209,69 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.u8 r2, q5[15]
 ; CHECK-NEXT:    vmov.16 q6[7], r2
 ; CHECK-NEXT:    vcmp.i16 ne, q6, zr
-; CHECK-NEXT:    vpsel q3, q1, q0
-; CHECK-NEXT:    vmov.32 q1[0], r5
-; CHECK-NEXT:    vmov.u16 r2, q3[0]
-; CHECK-NEXT:    vmov.u8 r5, q7[9]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[1]
-; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vpsel q3, q3, q0
 ; CHECK-NEXT:    vmov.u16 r2, q3[2]
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r6, q3[0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[3]
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmov.32 q1[2], r5
+; CHECK-NEXT:    vmov.u16 r6, q3[1]
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    and r6, r2, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    vmov.32 q4[0], r6
-; CHECK-NEXT:    vmov.32 q4[1], r6
+; CHECK-NEXT:    and r5, r2, #1
 ; CHECK-NEXT:    ubfx r6, r2, #4, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
 ; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    vmov.32 q4[2], r6
-; CHECK-NEXT:    vmov.32 q4[3], r6
-; CHECK-NEXT:    vmov.u8 r6, q6[8]
-; CHECK-NEXT:    vmov.32 q0[0], r6
-; CHECK-NEXT:    vmov.u8 r6, q6[9]
-; CHECK-NEXT:    vmov.32 q0[2], r6
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    vmov.32 q2[0], r6
+; CHECK-NEXT:    vmov q4[2], q4[0], r5, r6
+; CHECK-NEXT:    vmov q4[3], q4[1], r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[9]
+; CHECK-NEXT:    vmov.u8 r5, q1[8]
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q7[9]
+; CHECK-NEXT:    vmov q5[2], q5[0], r4, r5
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q5, q5, q2
 ; CHECK-NEXT:    vmov r6, s2
-; CHECK-NEXT:    vmov.32 q2[1], r5
-; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    vmov.32 q2[2], r6
-; CHECK-NEXT:    vmov.32 q2[3], r5
-; CHECK-NEXT:    vand q0, q2, q4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r6, s1
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    adds.w r6, r12, r4
 ; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r5, r5, r12
-; CHECK-NEXT:    adcs r6, r3
+; CHECK-NEXT:    adc.w r5, lr, r3
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    adc.w r12, r6, r4
-; CHECK-NEXT:    ubfx r6, r2, #8, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q4[0], r6
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q4[1], r6
-; CHECK-NEXT:    vmov.u8 r6, q7[10]
-; CHECK-NEXT:    vmov.32 q4[2], r2
-; CHECK-NEXT:    vmov.32 q1[0], r6
-; CHECK-NEXT:    vmov.32 q4[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q6[10]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q6[11]
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    ubfx r6, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r6, r6, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r12, r5, r4
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r6
+; CHECK-NEXT:    vmov.u8 r5, q7[10]
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r6
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.u8 r6, q1[10]
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    vmov.u8 r6, q7[11]
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q1[2], r6
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    umull r2, r6, r6, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov q5[2], q5[0], r5, r6
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q5, q5, q2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q2[1], r6
-; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    vmov r6, s22
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov r4, s20
 ; CHECK-NEXT:    umull r2, r6, r6, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r6
-; CHECK-NEXT:    vand q0, q2, q4
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
+; CHECK-NEXT:    vmov.u8 r4, q7[12]
+; CHECK-NEXT:    vand q0, q0, q4
 ; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s2
@@ -2473,83 +2279,68 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov r6, s3
 ; CHECK-NEXT:    adc.w r2, r2, r12
 ; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    vmov.u8 r5, q7[12]
-; CHECK-NEXT:    vmov.32 q1[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q7[13]
-; CHECK-NEXT:    vmov.32 q1[2], r5
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    adc.w r3, r2, r6
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[5]
-; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    adc.w lr, r2, r6
 ; CHECK-NEXT:    vmov.u16 r2, q3[6]
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r6, q3[4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[7]
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r6, q3[5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r6, r2, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    vmov.32 q3[0], r6
-; CHECK-NEXT:    vmov.32 q3[1], r6
+; CHECK-NEXT:    and r5, r2, #1
 ; CHECK-NEXT:    ubfx r6, r2, #4, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
 ; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    vmov.32 q3[2], r6
-; CHECK-NEXT:    vmov.32 q3[3], r6
-; CHECK-NEXT:    vmov.u8 r6, q6[12]
-; CHECK-NEXT:    vmov.32 q0[0], r6
-; CHECK-NEXT:    vmov.u8 r6, q6[13]
-; CHECK-NEXT:    vmov.32 q0[2], r6
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    vmov.32 q2[0], r6
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[13]
+; CHECK-NEXT:    vmov.u8 r5, q1[12]
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
+; CHECK-NEXT:    vmov.u8 r5, q7[13]
+; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r6, s2
-; CHECK-NEXT:    vmov.32 q2[1], r5
-; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    vmov.32 q2[2], r6
-; CHECK-NEXT:    vmov.32 q2[3], r5
-; CHECK-NEXT:    vand q0, q2, q3
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r6, s1
+; CHECK-NEXT:    umull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    adds.w r6, r12, r4
 ; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r5, r5, r12
-; CHECK-NEXT:    adcs r6, r3
+; CHECK-NEXT:    adc.w r5, lr, r3
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    adc.w r12, r6, r4
-; CHECK-NEXT:    ubfx r6, r2, #8, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q3[0], r6
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q3[1], r6
-; CHECK-NEXT:    vmov.u8 r6, q7[14]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q1[0], r6
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q6[14]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q6[15]
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    ubfx r6, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsb.w r6, r6, #0
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    adc.w r12, r5, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
+; CHECK-NEXT:    vmov.u8 r5, q7[14]
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r6
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    vmov.u8 r6, q1[14]
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    vmov.u8 r6, q7[15]
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q1[2], r6
-; CHECK-NEXT:    vand q0, q0, q5
-; CHECK-NEXT:    vand q1, q1, q5
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    umull r2, r6, r6, r2
-; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r6
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.32 q2[1], r6
 ; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    umull r2, r6, r6, r2
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r6
-; CHECK-NEXT:    vand q0, q2, q3
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s2
@@ -2560,7 +2351,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    adcs r2, r6
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    add sp, #80
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
@@ -2577,15 +2368,18 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    vmov.u8 r4, q0[0]
 ; CHECK-NEXT:    vpsel q4, q3, q2
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov.u8 r2, q4[0]
+; CHECK-NEXT:    vmov.u8 r5, q0[2]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[1]
 ; CHECK-NEXT:    vmov.16 q5[1], r2
@@ -2601,151 +2395,132 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
-; CHECK-NEXT:    vmov.u16 r2, q5[0]
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[1]
-; CHECK-NEXT:    vmov.32 q6[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[2]
-; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q5[0]
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[3]
-; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q5[1]
+; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q6[0], r3
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    vmrs r12, p0
+; CHECK-NEXT:    and r2, r12, #1
+; CHECK-NEXT:    ubfx r3, r12, #4, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov.32 q6[2], r3
-; CHECK-NEXT:    vmov.32 q6[3], r3
-; CHECK-NEXT:    vmov.u8 r3, q1[0]
-; CHECK-NEXT:    sxtb.w r12, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[0]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r3, r12, r3, r12
-; CHECK-NEXT:    vmov.32 q7[0], r3
-; CHECK-NEXT:    vmov.u8 r3, q1[1]
-; CHECK-NEXT:    vmov.32 q7[1], r12
-; CHECK-NEXT:    sxtb.w r12, r3
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[1]
 ; CHECK-NEXT:    vmov.u8 r3, q0[1]
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r3, r12, r3, r12
-; CHECK-NEXT:    vmov.32 q7[2], r3
-; CHECK-NEXT:    vmov.32 q7[3], r12
+; CHECK-NEXT:    smull r2, lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r3, q1[0]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r3, r4, r4, r3
+; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
+; CHECK-NEXT:    vmov q7[3], q7[1], r4, lr
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vmov r12, s27
-; CHECK-NEXT:    vmov r5, s25
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q6[0], r3
+; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov lr, s27
+; CHECK-NEXT:    vmov r3, s25
+; CHECK-NEXT:    adds r6, r2, r4
+; CHECK-NEXT:    ubfx r4, r12, #12, #1
+; CHECK-NEXT:    ubfx r2, r12, #8, #1
+; CHECK-NEXT:    rsb.w r4, r4, #0
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov.32 q6[2], r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q6[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[2]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    adc.w r12, r12, r5
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q7[0], r2
+; CHECK-NEXT:    adc.w lr, lr, r3
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r4
+; CHECK-NEXT:    vmov.u8 r3, q1[2]
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r4
 ; CHECK-NEXT:    vmov.u8 r2, q1[3]
-; CHECK-NEXT:    vmov.32 q7[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[3]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r4, q0[3]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q7[2], r2
-; CHECK-NEXT:    vmov.32 q7[3], r3
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r2, r4, r4, r2
+; CHECK-NEXT:    smull r3, r5, r5, r3
+; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
+; CHECK-NEXT:    vmov q7[3], q7[1], r5, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[4]
 ; CHECK-NEXT:    vand q6, q7, q6
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    vmov r4, s26
-; CHECK-NEXT:    vmov r5, s27
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[4]
-; CHECK-NEXT:    adc.w r3, r2, r5
-; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov.32 q6[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[5]
-; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov r5, s26
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s27
+; CHECK-NEXT:    adc.w r2, r2, lr
+; CHECK-NEXT:    adds.w r12, r3, r5
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r3, r4, r3, r4
+; CHECK-NEXT:    adc.w lr, r2, r6
 ; CHECK-NEXT:    vmov.u16 r2, q5[6]
-; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r6, q5[4]
+; CHECK-NEXT:    vmov q6[2], q6[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[7]
-; CHECK-NEXT:    vmov.32 q6[3], r2
-; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.u16 r6, q5[5]
+; CHECK-NEXT:    vmov q6[3], q6[1], r6, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    ubfx r6, r2, #4, #1
 ; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q5[0], r5
-; CHECK-NEXT:    vmov.32 q5[1], r5
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q5[2], r5
-; CHECK-NEXT:    vmov.32 q5[3], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[4]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q6[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[5]
-; CHECK-NEXT:    vmov.32 q6[1], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    vmov q5[2], q5[0], r5, r6
+; CHECK-NEXT:    vmov q5[3], q5[1], r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[5]
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    sxtb r6, r6
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q6[2], r5
-; CHECK-NEXT:    vmov.32 q6[3], r4
+; CHECK-NEXT:    smull r6, r5, r5, r6
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r6
+; CHECK-NEXT:    vmov q6[3], q6[1], r4, r5
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov r5, s21
-; CHECK-NEXT:    adds.w r12, r12, r4
+; CHECK-NEXT:    vmov r3, s21
+; CHECK-NEXT:    vmov r5, s23
+; CHECK-NEXT:    adds.w r6, r12, r4
 ; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    adcs r5, r3
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r5, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds r6, r6, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[6]
+; CHECK-NEXT:    adc.w r12, r3, r5
+; CHECK-NEXT:    ubfx r5, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q5[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    vmov.32 q5[2], r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q5[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q6[0], r2
+; CHECK-NEXT:    vmov q5[2], q5[0], r2, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q5[3], q5[1], r2, r5
 ; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r5, q0[7]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q6[2], r2
-; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    smull r2, r5, r5, r2
+; CHECK-NEXT:    smull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov q6[3], q6[1], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q1[8]
 ; CHECK-NEXT:    vand q5, q6, q5
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r5, s23
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s23
 ; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    adc.w r3, r2, r5
+; CHECK-NEXT:    adds.w r12, r3, r5
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r3, r4, r3, r4
+; CHECK-NEXT:    adc.w lr, r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -2762,155 +2537,131 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.32 q3[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r6, q2[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r6, q2[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r6, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    ubfx r6, r2, #4, #1
 ; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.32 q3[1], r5
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q3[2], r5
-; CHECK-NEXT:    vmov.32 q3[3], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[8]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q4[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[9]
-; CHECK-NEXT:    vmov.32 q4[1], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[9]
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[9]
+; CHECK-NEXT:    vmov.u8 r5, q0[9]
+; CHECK-NEXT:    sxtb r6, r6
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q4[2], r5
-; CHECK-NEXT:    vmov.32 q4[3], r4
+; CHECK-NEXT:    smull r6, r5, r5, r6
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r6
+; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    adds.w r12, r12, r4
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    adds.w r6, r12, r4
 ; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    adcs r5, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r5, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds r6, r6, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[10]
+; CHECK-NEXT:    adc.w r12, r3, r5
+; CHECK-NEXT:    ubfx r5, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[10]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r5
 ; CHECK-NEXT:    vmov.u8 r2, q1[11]
-; CHECK-NEXT:    vmov.32 q4[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[11]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r5, q0[11]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q4[2], r2
-; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    smull r2, r5, r5, r2
+; CHECK-NEXT:    smull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
+; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
+; CHECK-NEXT:    vmov.u8 r4, q1[12]
 ; CHECK-NEXT:    vand q3, q4, q3
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s15
 ; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    adc.w r3, r2, r5
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    adds.w r12, r3, r5
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r3, r4, r3, r4
+; CHECK-NEXT:    adc.w lr, r2, r6
 ; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r6, q2[4]
+; CHECK-NEXT:    vmov q3[2], q3[0], r6, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.u16 r6, q2[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r6, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    ubfx r6, r2, #4, #1
 ; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q2[0], r5
-; CHECK-NEXT:    vmov.32 q2[1], r5
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov.32 q2[2], r5
-; CHECK-NEXT:    vmov.32 q2[3], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[12]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q1[13]
-; CHECK-NEXT:    vmov.32 q3[1], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r5, r6
+; CHECK-NEXT:    vmov q2[3], q2[1], r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[13]
+; CHECK-NEXT:    vmov.u8 r5, q0[13]
+; CHECK-NEXT:    sxtb r6, r6
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    vmov.32 q3[2], r5
-; CHECK-NEXT:    vmov.32 q3[3], r4
+; CHECK-NEXT:    smull r6, r5, r5, r6
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    adds.w r12, r12, r4
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    vmov r5, s11
+; CHECK-NEXT:    adds.w r6, r12, r4
 ; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adcs r5, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r5, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds r6, r6, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[14]
+; CHECK-NEXT:    adc.w r12, r3, r5
+; CHECK-NEXT:    ubfx r5, r2, #12, #1
+; CHECK-NEXT:    ubfx r2, r2, #8, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.32 q2[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[14]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r5
 ; CHECK-NEXT:    vmov.u8 r2, q1[15]
-; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    vmov.u8 r3, q0[15]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r5, q0[15]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r3
-; CHECK-NEXT:    vand q0, q3, q2
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    smull r2, r5, r5, r2
+; CHECK-NEXT:    smull r3, r4, r3, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    vmov r6, s3
 ; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adcs r2, r6
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -2927,37 +2678,31 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q4, q0, q3
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    umull lr, r12, r3, r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vand q1, q2, q3
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.32 q0[3], r3
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    umull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
 ; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2966,7 +2711,6 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK-NEXT:    adc.w r3, lr, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -2985,36 +2729,32 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vand q3, q2, q3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov.32 q3[3], r3
-; CHECK-NEXT:    vand q0, q3, q2
+; CHECK-NEXT:    smull lr, r12, r3, r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    smull r2, r3, r2, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -3038,52 +2778,48 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
 ; CHECK-LABEL: add_v2i64_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    umull r12, lr, r3, r2
-; CHECK-NEXT:    mla r3, r3, r4, lr
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov.32 q3[0], r12
-; CHECK-NEXT:    mla r2, r4, r2, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    umull r4, r12, r3, r2
-; CHECK-NEXT:    vmov.32 q3[2], r4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    mla r3, r3, r4, r12
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    mla r2, r4, r2, r3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    cset r2, eq
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    umull r12, lr, r3, r2
+; CHECK-NEXT:    umull r6, r7, r5, r4
+; CHECK-NEXT:    vmov q3[2], q3[0], r6, r12
+; CHECK-NEXT:    vmov r6, s7
+; CHECK-NEXT:    mla r3, r3, r6, lr
+; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    mla r2, r6, r2, r3
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    mla r3, r5, r3, r7
+; CHECK-NEXT:    vmov r7, s1
+; CHECK-NEXT:    mla r3, r7, r4, r3
+; CHECK-NEXT:    vmov r7, s8
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
 ; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s9
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    orrs r3, r7
+; CHECK-NEXT:    cset r3, eq
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r12, s3
+; CHECK-NEXT:    vmov r7, s2
+; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    adds r7, r7, r6
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    adds r0, r0, r7
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
index 0a5f21058687..ff3b46aa95d0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
@@ -87,11 +87,9 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adc.w r12, r12, lr
 ; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov.32 q0[0], r5
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], r6
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index 7c8c0bae6bec..f33a7237151c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -280,11 +280,9 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -299,62 +297,58 @@ entry:
 define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vld2_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d8, d7
-; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.f32 s18, s22
-; CHECK-NEXT:    vmov.f32 s14, s20
-; CHECK-NEXT:    vmov.f32 s15, s21
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov.f64 d2, d1
-; CHECK-NEXT:    vmov r12, s19
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vmov.f32 s10, s22
+; CHECK-NEXT:    vmov.f32 s2, s20
+; CHECK-NEXT:    vmov.f32 s11, s23
+; CHECK-NEXT:    vmov.f32 s3, s21
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vmov r12, s11
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmov.f32 s6, s16
+; CHECK-NEXT:    vmov.f32 s7, s17
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r6, s4
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s5
+; CHECK-NEXT:    adcs r0, r2
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov.32 q3[2], lr
-; CHECK-NEXT:    vmov.32 q3[3], r12
-; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    adc.w r12, r2, r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    vmov q3[2], q3[0], r5, r3
 ; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    adds r4, r4, r6
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, lr
+; CHECK-NEXT:    adcs r0, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r12
+; CHECK-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %l1 = load <8 x i64>, <8 x i64>* %src, align 4
   %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index ad7f80560195..22ba78a9cbcb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -292,31 +292,25 @@ define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.32 q1[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r3, q0[1]
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
 ; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
 ; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
 ; CHECK-NEXT:    vadd.i32 q0, q1, q2
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
@@ -334,82 +328,78 @@ entry:
 define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld3_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
 ; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
 ; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
 ; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.f32 s22, s12
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.f32 s22, s8
 ; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmov.f32 s18, s11
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmovnb.i32 q3, q4
 ; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmovnb.i32 q6, q4
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r0
 ; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
 ; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
 ; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
 ; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov.f32 s19, s23
 ; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
 ; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
 ; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmovnb.i32 q2, q5
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vadd.i16 q1, q4, q3
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovnb.i32 q1, q5
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
+; CHECK-NEXT:    vadd.i16 q0, q4, q1
+; CHECK-NEXT:    vadd.i16 q0, q0, q3
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <24 x i16>, <24 x i16>* %src, align 4
@@ -427,151 +417,143 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
 ; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
 ; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
 ; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.f32 s18, s11
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
 ; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[7]
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.f32 s22, s12
-; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmovnb.i32 q6, q4
-; CHECK-NEXT:    vmov r2, s26
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s22, s8
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmovnb.i32 q3, q4
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
 ; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
 ; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
 ; CHECK-NEXT:    vmov.16 q4[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
 ; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[5]
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
 ; CHECK-NEXT:    vmov.16 q4[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
 ; CHECK-NEXT:    vmov.f32 s19, s23
 ; CHECK-NEXT:    vmov.16 q5[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    vmov.16 q5[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
 ; CHECK-NEXT:    vmov.16 q5[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[0]
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[3]
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[6]
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmovnb.i32 q2, q5
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov.32 q3[3], r2
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vadd.i16 q1, q4, q3
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.16 q5[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.16 q5[4], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.16 q5[2], r2
-; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.16 q0[5], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:    vmov.f32 s26, s12
-; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vmov q7, q6
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmovnb.i32 q7, q5
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov r0, s27
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmovnb.i32 q1, q5
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s21
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vadd.i16 q0, q4, q1
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vadd.i16 q0, q0, q3
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[1]
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[4]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.f32 s7, s23
 ; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
 ; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
 ; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q7[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.16 q7[7], r0
+; CHECK-NEXT:    vmov.f32 s22, s15
+; CHECK-NEXT:    vmov.f32 s30, s16
+; CHECK-NEXT:    vmov q6, q7
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmovnb.i32 q6, q5
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r0
+; CHECK-NEXT:    vmov r0, s31
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r0
+; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q7[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.16 q7[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q7[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.16 q7[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vmovnb.i32 q3, q6
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vadd.i16 q1, q5, q1
-; CHECK-NEXT:    vadd.i16 q1, q1, q4
+; CHECK-NEXT:    vmov.16 q7[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmov q2, q5
+; CHECK-NEXT:    vmovnb.i32 q2, q7
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vmov r2, s29
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
+; CHECK-NEXT:    vadd.i16 q1, q1, q2
+; CHECK-NEXT:    vadd.i16 q1, q1, q6
 ; CHECK-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
@@ -625,42 +607,38 @@ entry:
 define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-LABEL: vld3_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vldrb.u16 q2, [r0]
-; CHECK-NEXT:    ldr r3, [r0, #8]
-; CHECK-NEXT:    mov r2, sp
-; CHECK-NEXT:    str r3, [sp]
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vldrb.u16 q2, [r2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vadd.i32 q1, q1, q3
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    ldr r0, [r0, #8]
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    vmov.u16 r4, q0[0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[7]
+; CHECK-NEXT:    vmov.u16 r4, q0[1]
+; CHECK-NEXT:    vmov.u16 r12, q0[5]
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
+; CHECK-NEXT:    mov r3, sp
+; CHECK-NEXT:    vmov.u16 lr, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vldrb.u16 q0, [r3]
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vmov q2[2], q2[0], lr, r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov q2[3], q2[1], r12, r0
+; CHECK-NEXT:    vadd.i32 q0, q1, q2
 ; CHECK-NEXT:    vstrb.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %l1 = load <12 x i8>, <12 x i8>* %src, align 4
   %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -743,81 +721,76 @@ entry:
 define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-LABEL: vld3_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov.u8 r2, q2[0]
-; CHECK-NEXT:    vmov.8 q1[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q2[3]
-; CHECK-NEXT:    vmov.8 q1[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q2[6]
-; CHECK-NEXT:    vmov.8 q1[2], r2
-; CHECK-NEXT:    vmov.u8 r2, q2[9]
-; CHECK-NEXT:    vmov.8 q1[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q2[12]
-; CHECK-NEXT:    vmov.8 q1[4], r2
-; CHECK-NEXT:    vmov.u8 r2, q2[15]
-; CHECK-NEXT:    vmov.8 q1[5], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    vmov.8 q1[6], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    vmov.8 q1[7], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.32 q3[0], r2
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.32 q3[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.8 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.8 q3[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov.8 q3[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    vmov.8 q3[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    vmov.8 q3[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.8 q3[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[2]
+; CHECK-NEXT:    vmov.8 q3[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[8]
 ; CHECK-NEXT:    vmov.8 q4[8], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.u8 r2, q2[11]
 ; CHECK-NEXT:    vmov.8 q4[9], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov.u8 r2, q2[14]
 ; CHECK-NEXT:    vmov.8 q4[10], r2
 ; CHECK-NEXT:    vmov.u8 r0, q1[1]
 ; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-NEXT:    vmov.8 q3[7], r0
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[1]
-; CHECK-NEXT:    vmov.8 q5[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[4]
-; CHECK-NEXT:    vmov.8 q5[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[7]
-; CHECK-NEXT:    vmov.8 q5[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[10]
-; CHECK-NEXT:    vmov.8 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[13]
-; CHECK-NEXT:    vmov.8 q5[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.8 q5[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.8 q5[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.8 q5[7], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
 ; CHECK-NEXT:    vmov.8 q5[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
 ; CHECK-NEXT:    vmov.8 q5[9], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
 ; CHECK-NEXT:    vmov.8 q5[10], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[2]
 ; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
 ; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.8 q5[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.8 q5[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.8 q5[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    vmov.8 q5[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    vmov.8 q5[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[0]
+; CHECK-NEXT:    vmov.8 q5[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[3]
+; CHECK-NEXT:    vmov.8 q5[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[6]
+; CHECK-NEXT:    vmov.8 q5[7], r2
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov q6[2], q6[0], r2, r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.8 q7[12], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.8 q7[13], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.8 q7[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.8 q5[15], r0
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.8 q7[15], r0
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    vmov r0, s31
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[4]
 ; CHECK-NEXT:    vmov.8 q5[12], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[7]
@@ -826,52 +799,51 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.8 q5[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[13]
 ; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov r2, s13
 ; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vadd.i8 q3, q3, q4
-; CHECK-NEXT:    vmov.8 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[5]
-; CHECK-NEXT:    vmov.8 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[8]
-; CHECK-NEXT:    vmov.8 q4[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[11]
-; CHECK-NEXT:    vmov.8 q4[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
-; CHECK-NEXT:    vmov.8 q4[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.8 q4[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.8 q4[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.8 q4[7], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vadd.i8 q3, q4, q6
 ; CHECK-NEXT:    vmov.8 q4[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.u8 r2, q0[2]
 ; CHECK-NEXT:    vmov.8 q4[9], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[0]
 ; CHECK-NEXT:    vmov.8 q4[10], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.8 q4[11], r0
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.8 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov.8 q4[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    vmov.8 q4[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.8 q4[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov.8 q4[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[1]
+; CHECK-NEXT:    vmov.8 q4[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[4]
+; CHECK-NEXT:    vmov.8 q4[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q2[7]
+; CHECK-NEXT:    vmov.8 q4[7], r2
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[12], r0
+; CHECK-NEXT:    vmov.8 q2[12], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.8 q0[13], r0
+; CHECK-NEXT:    vmov.8 q2[13], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.8 q0[14], r0
+; CHECK-NEXT:    vmov.8 q2[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.8 q0[15], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vadd.i8 q0, q3, q2
+; CHECK-NEXT:    vmov.8 q2[15], r0
+; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
+; CHECK-NEXT:    vadd.i8 q0, q3, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <48 x i8>, <48 x i8>* %src, align 4
@@ -925,11 +897,9 @@ define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    adcs r2, r3
 ; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r4, pc}
@@ -947,93 +917,89 @@ entry:
 define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vld3_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #24
 ; CHECK-NEXT:    sub sp, #24
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov.f64 d4, d0
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s9, s1
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vmov.f64 d14, d11
-; CHECK-NEXT:    vmov.f32 s29, s23
-; CHECK-NEXT:    vmov.f32 s30, s0
-; CHECK-NEXT:    vmov.f32 s22, s26
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vmov.f32 s31, s1
-; CHECK-NEXT:    vmov r3, s30
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s18, s4
+; CHECK-NEXT:    vmov.f32 s19, s5
+; CHECK-NEXT:    vmov.f64 d12, d11
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s25, s23
+; CHECK-NEXT:    vmov.f32 s26, s4
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vmov.f32 s27, s5
+; CHECK-NEXT:    vmov.f32 s23, s15
+; CHECK-NEXT:    vmov r3, s26
 ; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.f64 d6, d3
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s10, s18
-; CHECK-NEXT:    vmov.f32 s14, s16
-; CHECK-NEXT:    vmov.f32 s11, s19
-; CHECK-NEXT:    vmov.f32 s15, s17
-; CHECK-NEXT:    vmov.f64 d8, d12
-; CHECK-NEXT:    vmov.f32 s17, s25
-; CHECK-NEXT:    vmov.f32 s18, s2
-; CHECK-NEXT:    vmov.f32 s19, s3
-; CHECK-NEXT:    vmov r12, s31
+; CHECK-NEXT:    vmov.f64 d14, d6
+; CHECK-NEXT:    vmov r12, s27
 ; CHECK-NEXT:    vmov r2, s23
+; CHECK-NEXT:    vmov.f32 s29, s13
+; CHECK-NEXT:    vmov.f32 s30, s6
+; CHECK-NEXT:    vmov.f32 s31, s7
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov r6, s8
+; CHECK-NEXT:    vmov r7, s24
 ; CHECK-NEXT:    adds.w lr, r0, r3
+; CHECK-NEXT:    vmov r0, s30
+; CHECK-NEXT:    adc.w r3, r2, r12
+; CHECK-NEXT:    vmov r2, s31
+; CHECK-NEXT:    adds.w lr, lr, r0
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    adc.w r12, r3, r2
 ; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, lr, r0
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    adc.w r12, r2, r3
-; CHECK-NEXT:    vmov r3, s29
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vmov.f32 s7, s3
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adcs r0, r2
 ; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    adds.w lr, r3, r4
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    adc.w r12, r0, r2
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    adc.w r8, r2, r3
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    adds r5, r5, r6
+; CHECK-NEXT:    vmov r6, s21
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r0
+; CHECK-NEXT:    vmov r0, s29
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, s25
+; CHECK-NEXT:    adds r2, r2, r7
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r8
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    adcs r4, r6
+; CHECK-NEXT:    vmov r6, s28
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    adcs r0, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <12 x i64>, <12 x i64>* %src, align 4
   %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1353,97 +1319,93 @@ entry:
 define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld3_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s0, s6
-; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmovx.f16 s0, s10
+; CHECK-NEXT:    vmov r3, s9
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov.16 q0[0], r3
 ; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmovx.f16 s12, s5
 ; CHECK-NEXT:    vmov.16 q0[2], r2
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s19
 ; CHECK-NEXT:    vmov.16 q0[3], r2
 ; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    vmovx.f16 s20, s16
 ; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmov.16 q3[6], r3
 ; CHECK-NEXT:    vmov.16 q3[7], r0
 ; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovx.f16 s24, s11
+; CHECK-NEXT:    vmov r5, s8
 ; CHECK-NEXT:    vmov.f32 s14, s16
-; CHECK-NEXT:    vmovx.f16 s24, s8
-; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    vmov.16 q5[5], r0
 ; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov lr, s22
+; CHECK-NEXT:    vmov r12, s22
 ; CHECK-NEXT:    vmovx.f16 s20, s17
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q5[6], r3
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r12, s23
-; CHECK-NEXT:    vmovx.f16 s20, s10
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q5[4], r3
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    vmovx.f16 s20, s18
+; CHECK-NEXT:    vmov lr, s23
+; CHECK-NEXT:    vmovx.f16 s20, s6
 ; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov.16 q4[6], r2
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmovx.f16 s20, s5
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    vmovx.f16 s20, s8
 ; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.16 q5[1], r2
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.16 q5[1], r0
 ; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmov.16 q5[2], r0
+; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmovx.f16 s24, s11
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmovx.f16 s24, s4
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.16 q6[0], r2
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmovx.f16 s20, s9
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vmov.16 q5[0], r5
+; CHECK-NEXT:    vmov.16 q5[1], r4
+; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vmov.16 q5[2], r4
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s18
+; CHECK-NEXT:    vmov.16 q5[3], r4
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vmov.16 q5[4], r4
+; CHECK-NEXT:    vmov r4, s17
 ; CHECK-NEXT:    vmovx.f16 s4, s7
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov.16 q2[6], r4
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov.16 q5[5], r4
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.16 q2[7], r5
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r12
 ; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.32 q1[1], r4
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.32 q1[2], lr
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov r4, s15
-; CHECK-NEXT:    vmov.f32 s23, s19
-; CHECK-NEXT:    vmov.32 q0[3], r12
-; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
+; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, lr
+; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r5
 ; CHECK-NEXT:    vadd.f16 q0, q5, q0
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %l1 = load <24 x half>, <24 x half>* %src, align 4
   %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -1458,182 +1420,174 @@ entry:
 define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld3_v16f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT:    vmovx.f16 s0, s19
-; CHECK-NEXT:    vmovx.f16 s4, s16
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmovx.f16 s20, s13
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vmovx.f16 s4, s8
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmovx.f16 s20, s17
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov.16 q0[6], r2
 ; CHECK-NEXT:    vmov r12, s4
-; CHECK-NEXT:    vmovx.f16 s4, s10
-; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmovx.f16 s4, s14
+; CHECK-NEXT:    vmov r2, s13
 ; CHECK-NEXT:    vmov.16 q0[7], r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov.16 q1[0], r2
 ; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov.16 q1[2], r2
 ; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vmovx.f16 s24, s12
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vmovx.f16 s16, s18
-; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmovx.f16 s24, s13
+; CHECK-NEXT:    vmov.f32 s6, s19
+; CHECK-NEXT:    vmovx.f16 s28, s16
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov r4, s14
 ; CHECK-NEXT:    vmov r5, s4
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov r2, s9
 ; CHECK-NEXT:    vmov.16 q5[5], r12
-; CHECK-NEXT:    vmov lr, s22
-; CHECK-NEXT:    vmovx.f16 s20, s14
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov r2, s19
-; CHECK-NEXT:    vmov.16 q5[5], r3
 ; CHECK-NEXT:    vmov r12, s22
-; CHECK-NEXT:    vmovx.f16 s20, s17
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q5[6], r3
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.16 q4[6], r3
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    vmov.16 q4[7], r4
-; CHECK-NEXT:    vmovx.f16 s20, s9
-; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmovx.f16 s20, s10
 ; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov.16 q5[1], r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    vmov.16 q5[2], r3
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmov.16 q5[3], r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmovx.f16 s24, s15
-; CHECK-NEXT:    vmov.16 q5[4], r3
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmovx.f16 s24, s8
-; CHECK-NEXT:    vmov.16 q5[5], r3
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov.16 q5[7], r3
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r2, s24
 ; CHECK-NEXT:    vmov.16 q6[0], r3
-; CHECK-NEXT:    vmovx.f16 s8, s11
-; CHECK-NEXT:    vmov.16 q6[1], r4
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov.16 q6[2], r2
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmovx.f16 s28, s19
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmovx.f16 s28, s18
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov.16 q7[4], r2
+; CHECK-NEXT:    vmovx.f16 s8, s9
+; CHECK-NEXT:    vmov.16 q7[5], r3
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q2[6], r3
+; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vmov lr, s30
+; CHECK-NEXT:    vmov r6, s11
+; CHECK-NEXT:    vmovx.f16 s8, s12
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmovx.f16 s12, s15
+; CHECK-NEXT:    vmov.16 q2[0], r3
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov.16 q2[1], r4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.16 q2[2], r3
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov.16 q2[3], r3
+; CHECK-NEXT:    vmov.f32 s27, s23
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.32 q2[0], r5
-; CHECK-NEXT:    vmov.16 q6[2], r3
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    vmov.16 q6[3], r3
-; CHECK-NEXT:    vmov r5, s5
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmov.32 q2[1], r5
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov r4, s25
-; CHECK-NEXT:    vmov.32 q2[2], lr
-; CHECK-NEXT:    vmov.32 q0[1], r4
-; CHECK-NEXT:    vmov.f32 s23, s19
-; CHECK-NEXT:    vmov.32 q0[2], r12
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmov.32 q2[3], r5
-; CHECK-NEXT:    vadd.f16 q0, q5, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vadd.f16 q0, q0, q2
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    vmov q2[2], q2[0], r5, r12
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r5
+; CHECK-NEXT:    vadd.f16 q0, q6, q0
+; CHECK-NEXT:    vmovx.f16 s12, s16
+; CHECK-NEXT:    vadd.f16 q1, q0, q2
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s0, s19
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s4, s19
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmovx.f16 s12, s16
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.16 q1[7], r2
 ; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s10
 ; CHECK-NEXT:    vmov r2, s9
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov.16 q3[0], r2
 ; CHECK-NEXT:    vmov.16 q3[1], r3
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s20, s1
 ; CHECK-NEXT:    vmov.16 q3[2], r2
 ; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmovx.f16 s24, s4
+; CHECK-NEXT:    vmovx.f16 s24, s9
 ; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov.f32 s14, s7
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vmovx.f16 s16, s18
-; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmovx.f16 s28, s0
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s6, s16
+; CHECK-NEXT:    vmov r5, s10
+; CHECK-NEXT:    vmov r4, s12
 ; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    vmovx.f16 s20, s6
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q5[5], r3
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    vmov r12, s22
-; CHECK-NEXT:    vmovx.f16 s20, s17
-; CHECK-NEXT:    vmov r5, s20
-; CHECK-NEXT:    vmov.16 q5[6], r5
-; CHECK-NEXT:    vmov r5, s17
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmovx.f16 s20, s18
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.16 q5[7], r3
-; CHECK-NEXT:    vmov.16 q4[6], r5
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    vmov.16 q4[7], r4
-; CHECK-NEXT:    vmovx.f16 s20, s9
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r5, s20
-; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov.16 q5[1], r5
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    vmov.16 q5[2], r5
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    vmov.16 q5[3], r5
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmovx.f16 s24, s7
-; CHECK-NEXT:    vmov.16 q5[4], r5
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    vmovx.f16 s24, s8
-; CHECK-NEXT:    vmov.16 q5[5], r5
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov.16 q6[0], r5
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov.16 q6[0], r3
+; CHECK-NEXT:    vmov.16 q6[1], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q6[2], r2
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmov.16 q6[3], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmovx.f16 s28, s3
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmovx.f16 s28, s2
+; CHECK-NEXT:    vmov.16 q6[5], r2
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmovx.f16 s16, s17
+; CHECK-NEXT:    vmov.16 q7[4], r2
+; CHECK-NEXT:    vmov r6, s16
+; CHECK-NEXT:    vmov.16 q7[5], r3
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov.16 q4[6], r6
+; CHECK-NEXT:    vmov r2, s30
+; CHECK-NEXT:    vmov.16 q4[7], r3
+; CHECK-NEXT:    vmov.f32 s27, s23
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmov r6, s16
 ; CHECK-NEXT:    vmovx.f16 s8, s11
-; CHECK-NEXT:    vmov.16 q6[1], r4
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov.16 q6[2], r5
-; CHECK-NEXT:    vmov r5, s5
-; CHECK-NEXT:    vmov.16 q6[3], r5
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.32 q0[0], r5
-; CHECK-NEXT:    vmov r4, s25
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.32 q0[1], r4
-; CHECK-NEXT:    vmov.f32 s23, s19
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.32 q0[3], r3
-; CHECK-NEXT:    vadd.f16 q0, q5, q0
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vmov.16 q4[0], r6
+; CHECK-NEXT:    vmov r6, s8
+; CHECK-NEXT:    vmov.16 q4[1], r5
+; CHECK-NEXT:    vmov.16 q4[2], r6
+; CHECK-NEXT:    vmov r6, s1
+; CHECK-NEXT:    vmov.16 q4[3], r6
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT:    vmov r6, s16
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
+; CHECK-NEXT:    vmov r5, s17
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
+; CHECK-NEXT:    vmov r4, s13
+; CHECK-NEXT:    vmov q0[3], q0[1], r4, r0
+; CHECK-NEXT:    vadd.f16 q1, q6, q1
+; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %l1 = load <48 x half>, <48 x half>* %src, align 4
   %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
index e8f336871326..8163e550b6f8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
@@ -152,10 +152,8 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    adcs r4, r5
 ; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    adc.w r3, r4, r6
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index 776598d06db7..c05eaa3fa09a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -229,41 +229,33 @@ define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vadd.i32 q2, q3, q2
+; CHECK-NEXT:    vmov q3[2], q3[0], r2, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
 ; CHECK-NEXT:    vadd.i32 q0, q3, q4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
@@ -389,15 +381,13 @@ define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-LABEL: vld4_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vrev32.8 q2, q0
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r2, q0[2]
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vrev32.8 q2, q0
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, q2
 ; CHECK-NEXT:    vrev16.8 q2, q0
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
@@ -589,10 +579,8 @@ define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    adds r0, r0, r5
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
@@ -612,123 +600,118 @@ entry:
 define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vld4_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #72
-; CHECK-NEXT:    sub sp, #72
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d8, d3
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s17, s7
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov.f32 s18, s2
-; CHECK-NEXT:    vmov.f32 s19, s3
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT:    vmov.f64 d12, d11
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d3
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s25, s23
-; CHECK-NEXT:    vmov.f32 s26, s2
-; CHECK-NEXT:    vmov.f64 d6, d3
-; CHECK-NEXT:    vmov.f32 s27, s3
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT:    vmov.f64 d14, d9
+; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d4, d15
-; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vmov.f32 s29, s19
+; CHECK-NEXT:    vmov.f32 s30, s2
+; CHECK-NEXT:    vmov.f64 d4, d13
+; CHECK-NEXT:    vmov.f32 s31, s3
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s9, s31
+; CHECK-NEXT:    vmov.f32 s9, s27
 ; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s30, s0
+; CHECK-NEXT:    vmov.f32 s26, s0
 ; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov.f32 s31, s1
+; CHECK-NEXT:    vmov.f32 s27, s1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s7, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vmov.f64 d10, d7
 ; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r2, s31
-; CHECK-NEXT:    vmov.f32 s22, s0
-; CHECK-NEXT:    vmov.f32 s23, s1
+; CHECK-NEXT:    vmov r2, s27
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s14, s4
+; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vmov.f32 s23, s7
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    vmov.f32 s18, s0
+; CHECK-NEXT:    vmov.f32 s19, s1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    vmov r7, s16
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov r2, s23
 ; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s23
+; CHECK-NEXT:    vmov r4, s28
 ; CHECK-NEXT:    adcs r0, r2
 ; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r4, r3
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov r0, s29
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adcs r3, r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov r7, s6
 ; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s20
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    adds r0, r0, r5
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    adc.w r8, r3, r2
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    vmov r4, s21
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r6, s18
+; CHECK-NEXT:    adcs r4, r0
+; CHECK-NEXT:    adds.w r9, r5, r2
+; CHECK-NEXT:    vmov r5, s30
+; CHECK-NEXT:    adc.w r8, r4, r3
+; CHECK-NEXT:    vmov r2, s31
+; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s1
+; CHECK-NEXT:    vmov r6, s3
 ; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    vmov r4, s7
 ; CHECK-NEXT:    adds r3, r3, r7
-; CHECK-NEXT:    vmov r7, s28
+; CHECK-NEXT:    vmov r7, s12
 ; CHECK-NEXT:    adcs r4, r6
 ; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    vmov r6, s8
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov r5, s29
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    vmov.32 q0[3], r8
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov r6, s20
+; CHECK-NEXT:    adc.w r10, r4, r2
+; CHECK-NEXT:    vmov r4, s21
+; CHECK-NEXT:    vmov q1[2], q1[0], r9, r3
+; CHECK-NEXT:    vmov r5, s13
+; CHECK-NEXT:    vmov q1[3], q1[1], r8, r10
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    adds r6, r6, r7
+; CHECK-NEXT:    vmov r7, s25
 ; CHECK-NEXT:    adcs r4, r5
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    adds r0, r0, r5
-; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r5, s9
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r7, r5
 ; CHECK-NEXT:    adds r0, r0, r6
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
+; CHECK-NEXT:    adc.w r0, r4, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #72
+; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %l1 = load <16 x i64>, <16 x i64>* %src, align 4
   %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 279665604967..f3b2c96679b3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -5,14 +5,12 @@ define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
 ; CHECK-LABEL: vmulhs_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmullb.s32 q2, q0, q1
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <2 x i32> %s0 to <2 x i64>
@@ -48,28 +46,28 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
 ; CHECK-LABEL: vmulhs_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.f32 s8, s4
 ; CHECK-NEXT:    vmov.f32 s12, s0
-; CHECK-NEXT:    vmov.f32 s14, s1
 ; CHECK-NEXT:    vmov.f32 s10, s5
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.f32 s12, s6
-; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov.f32 s14, s1
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r1, s12
+; CHECK-NEXT:    vmov.f32 s16, s6
+; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmullb.s32 q0, q1, q3
+; CHECK-NEXT:    vmullb.s32 q5, q1, q4
 ; CHECK-NEXT:    smmul r0, r1, r0
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    smmul r1, r2, r1
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov r1, s21
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    smmul r0, r1, r0
+; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -141,22 +139,19 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vmulhs_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmullb.s16 q2, q3, q2
 ; CHECK-NEXT:    vshr.s32 q3, q2, #16
 ; CHECK-NEXT:    vmov r0, s12
@@ -167,22 +162,17 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov.16 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmullb.s16 q0, q1, q3
 ; CHECK-NEXT:    vshr.s32 q0, q0, #16
 ; CHECK-NEXT:    vmov r0, s0
@@ -207,22 +197,19 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vmulhu_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmullb.u16 q2, q3, q2
 ; CHECK-NEXT:    vshr.u32 q3, q2, #16
 ; CHECK-NEXT:    vmov r0, s12
@@ -233,22 +220,17 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov.16 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmullb.u16 q0, q1, q3
 ; CHECK-NEXT:    vshr.u32 q0, q0, #16
 ; CHECK-NEXT:    vmov r0, s0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
index ecb119888c86..bf1153fe0e26 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
@@ -4,38 +4,33 @@
 define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32* nocapture %z, i32 %n) {
 ; CHECK-LABEL: test32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r5, lr}
-; CHECK-NEXT:    push {r5, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    cmp r3, #1
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    poplt {r5, pc}
+; CHECK-NEXT:    blt .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vmullt.s32 q3, q2, q1
-; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    vmov r12, s12
-; CHECK-NEXT:    lsrl r12, r5, #31
-; CHECK-NEXT:    vmov.32 q0[0], r12
-; CHECK-NEXT:    vmov r12, s14
-; CHECK-NEXT:    vmov.32 q0[1], r5
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    lsrl r12, r5, #31
+; CHECK-NEXT:    vmullt.s32 q0, q2, q1
 ; CHECK-NEXT:    vmullb.s32 q3, q2, q1
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov r12, s12
-; CHECK-NEXT:    vmov.32 q0[3], r5
-; CHECK-NEXT:    vmov r5, s13
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r7, s1
 ; CHECK-NEXT:    lsrl r12, r5, #31
-; CHECK-NEXT:    vmov.32 q1[0], r12
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    lsrl r4, r7, #31
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r12
 ; CHECK-NEXT:    vmov r12, s14
-; CHECK-NEXT:    vmov.32 q1[1], r5
+; CHECK-NEXT:    vmov q0[3], q0[1], r7, r5
 ; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    vmov r7, s13
 ; CHECK-NEXT:    lsrl r12, r5, #31
-; CHECK-NEXT:    vmov.32 q1[2], r12
-; CHECK-NEXT:    vmov.32 q1[3], r5
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    lsrl r4, r7, #31
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r12
+; CHECK-NEXT:    vmov q1[3], q1[1], r7, r5
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.f32 s9, s7
 ; CHECK-NEXT:    vmov.f32 s6, s0
@@ -47,8 +42,8 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noa
 ; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB0_1
-; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r5, pc}
+; CHECK-NEXT:  .LBB0_2: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = and i32 %n, 3
   %cmp = icmp eq i32 %0, 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
index 995ac7d88fde..e8979b439a0b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
@@ -73,22 +73,19 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q2[1]
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q2[4]
 ; CHECK-NEXT:    vmullb.s16 q0, q3, q0
 ; CHECK-NEXT:    vmov.i32 q3, #0x7fff
 ; CHECK-NEXT:    vshl.i32 q0, q0, #10
@@ -103,22 +100,17 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov.16 q0[2], r0
 ; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.32 q4[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r1, q2[5]
+; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-NEXT:    vmullb.s16 q1, q2, q4
 ; CHECK-NEXT:    vshl.i32 q1, q1, #10
 ; CHECK-NEXT:    vshr.s32 q1, q1, #10

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
index b8304cf82bea..c7854e7061f3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
@@ -164,57 +164,53 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_smaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mvn r3, #-2147483648
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    subs r2, r2, r3
-; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    mvn r12, #-2147483648
 ; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    subs r2, r2, r3
-; CHECK-NEXT:    mov.w r3, #-1
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    subs.w r2, r2, r12
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    subs.w r3, r3, r12
+; CHECK-NEXT:    mov.w r12, #-1
+; CHECK-NEXT:    sbcs r2, r2, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
 ; CHECK-NEXT:    adr r1, .LCPI12_0
 ; CHECK-NEXT:    vldrw.u32 q2, [r1]
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    sbcs.w r1, r3, r1
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    sbcs.w r1, r12, r1
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    sbcs.w r1, r3, r1
+; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
+; CHECK-NEXT:    sbcs.w r2, r12, r2
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
 ; CHECK-NEXT:    adr r0, .LCPI12_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
@@ -244,57 +240,53 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_sminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mov.w r3, #-1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    sbcs.w r1, r3, r1
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    mov.w r12, #-1
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    sbcs.w r1, r3, r1
-; CHECK-NEXT:    mvn r3, #-2147483648
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    sbcs.w r1, r12, r1
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
+; CHECK-NEXT:    sbcs.w r2, r12, r2
+; CHECK-NEXT:    mvn r12, #-2147483648
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
 ; CHECK-NEXT:    adr r1, .LCPI13_0
 ; CHECK-NEXT:    vldrw.u32 q2, [r1]
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    subs r2, r2, r3
-; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    subs.w r2, r2, r12
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    subs r2, r2, r3
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    subs.w r3, r3, r12
+; CHECK-NEXT:    sbcs r2, r2, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
 ; CHECK-NEXT:    adr r0, .LCPI13_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
@@ -324,29 +316,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_umaxmin(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_umaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    subs.w r1, r1, #-1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    sbcs r0, r0, #0
-; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    subs.w r1, r1, #-1
-; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    subs.w r3, r3, #-1
+; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
@@ -360,29 +350,27 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqmovni64_uminmax(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_uminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    subs.w r1, r1, #-1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    sbcs r0, r0, #0
-; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    subs.w r1, r1, #-1
-; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    subs.w r3, r3, #-1
+; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vorr q0, q0, q2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
index c2210925bfe1..6c64285c5fd8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
@@ -180,71 +180,65 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_smaxmin(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_smaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r5, lr}
-; CHECK-NEXT:    push {r5, lr}
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    mvn lr, #-2147483648
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    asrl r2, r5, #3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    subs.w r0, r2, lr
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    sbcs r0, r5, #0
-; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    mvn r12, #-2147483648
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    mov.w lr, #0
+; CHECK-NEXT:    asrl r2, r1, #3
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    subs.w r3, r2, r12
+; CHECK-NEXT:    sbcs r3, r1, #0
+; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    asrl r4, r3, #3
+; CHECK-NEXT:    subs.w r0, r4, r12
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
+; CHECK-NEXT:    sbcs r0, r3, #0
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    mov.w r2, #-1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    asrl r0, r3, #3
-; CHECK-NEXT:    subs.w r1, r0, lr
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    sbcs r1, r3, #0
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    adr r1, .LCPI12_0
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vbic q0, q0, q1
-; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r5
+; CHECK-NEXT:    adr r0, .LCPI12_0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vbic q1, q1, q0
+; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    sbcs.w r0, r2, r0
-; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
-; CHECK-NEXT:    sbcs.w r0, r2, r0
+; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
+; CHECK-NEXT:    sbcs.w r1, r2, r1
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r12, #1
-; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    movlt.w lr, #1
+; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    adr r0, .LCPI12_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    pop {r5, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI12_0:
@@ -269,71 +263,65 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_sminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    mov.w lr, #-1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    mov.w r12, #-1
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    mov.w lr, #0
 ; CHECK-NEXT:    asrl r2, r1, #3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    rsbs.w r0, r2, #-2147483648
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    sbcs.w r0, lr, r1
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    mov.w r0, #0
-; CHECK-NEXT:    mvn r2, #-2147483648
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    rsbs.w r3, r2, #-2147483648
+; CHECK-NEXT:    sbcs.w r3, r12, r1
+; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #1
-; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    asrl r0, r3, #3
-; CHECK-NEXT:    rsbs.w r4, r0, #-2147483648
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    sbcs.w r4, lr, r3
-; CHECK-NEXT:    vmov.32 q2[3], r3
-; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    asrl r4, r3, #3
+; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
+; CHECK-NEXT:    sbcs.w r5, r12, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
+; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    mvn r2, #-2147483648
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov.32 q1[2], r4
-; CHECK-NEXT:    vmov.32 q1[3], r4
-; CHECK-NEXT:    adr r4, .LCPI13_0
-; CHECK-NEXT:    vldrw.u32 q0, [r4]
-; CHECK-NEXT:    vbic q0, q0, q1
-; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    movlt r5, #1
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r5, r0
+; CHECK-NEXT:    adr r0, .LCPI13_0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vbic q1, q1, q0
+; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    subs r1, r1, r2
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    sbcs r0, r0, #0
-; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    subs r1, r1, r2
-; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r12, #1
-; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    movlt.w lr, #1
+; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    adr r0, .LCPI13_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI13_0:
@@ -358,41 +346,37 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_umaxmin(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_umaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r5, r6, r7, lr}
-; CHECK-NEXT:    push {r5, r6, r7, lr}
-; CHECK-NEXT:    vmov r7, s1
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    lsrl r0, r7, #3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    subs.w r2, r0, #-1
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    sbcs r2, r7, #0
-; CHECK-NEXT:    vmov.32 q2[1], r7
-; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
+; CHECK-NEXT:    lsrl r0, r5, #3
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    subs.w r3, r0, #-1
+; CHECK-NEXT:    sbcs r3, r5, #0
+; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    csetm r12, ne
+; CHECK-NEXT:    lsrl r4, r3, #3
+; CHECK-NEXT:    subs.w r1, r4, #-1
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r0
+; CHECK-NEXT:    sbcs r1, r3, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    lsrl r2, r3, #3
-; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    subs.w r5, r2, #-1
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    sbcs r5, r3, #0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r5
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vbic q0, q0, q1
-; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    pop {r5, r6, r7, pc}
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
+; CHECK-NEXT:    vbic q1, q1, q0
+; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %s0 = lshr <2 x i64> %so, <i64 3, i64 3>
   %c1 = icmp ult <2 x i64> %s0, <i64 4294967295, i64 4294967295>
@@ -403,41 +387,37 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_uminmax(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_uminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r5, r6, r7, lr}
-; CHECK-NEXT:    push {r5, r6, r7, lr}
-; CHECK-NEXT:    vmov r7, s1
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    lsrl r0, r7, #3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    subs.w r2, r0, #-1
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    sbcs r2, r7, #0
-; CHECK-NEXT:    vmov.32 q2[1], r7
-; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
+; CHECK-NEXT:    lsrl r0, r5, #3
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    subs.w r3, r0, #-1
+; CHECK-NEXT:    sbcs r3, r5, #0
+; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    csetm r12, ne
+; CHECK-NEXT:    lsrl r4, r3, #3
+; CHECK-NEXT:    subs.w r1, r4, #-1
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r0
+; CHECK-NEXT:    sbcs r1, r3, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    lsrl r2, r3, #3
-; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    subs.w r5, r2, #-1
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    sbcs r5, r3, #0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r5
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vbic q0, q0, q1
-; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    pop {r5, r6, r7, pc}
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r12
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
+; CHECK-NEXT:    vbic q1, q1, q0
+; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %s0 = lshr <2 x i64> %so, <i64 3, i64 3>
   %c2 = icmp ult <2 x i64> %s0, <i64 4294967295, i64 4294967295>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index b815ed24ae26..6416d65f2e1c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -6,21 +6,19 @@
 define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vst2_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    ldrd r2, r12, [r0]
-; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q1[0], r3
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.f64 d4, d1
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.f32 s9, s3
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    ldrd r12, r3, [r0]
+; CHECK-NEXT:    ldrd r2, r0, [r0, #8]
+; CHECK-NEXT:    vmov q0[2], q0[0], r12, r3
+; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
+; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -113,14 +111,12 @@ entry:
 define void @vst2_v2i16(<2 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK-LABEL: vst2_v2i16:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrh r2, [r0, #2]
 ; CHECK-NEXT:    ldrh r3, [r0]
-; CHECK-NEXT:    ldrh r2, [r0, #4]
-; CHECK-NEXT:    vmov.32 q0[0], r3
 ; CHECK-NEXT:    ldrh.w r12, [r0, #6]
-; CHECK-NEXT:    ldrh r0, [r0, #2]
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    ldrh r0, [r0, #4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -197,13 +193,11 @@ define void @vst2_v2i8(<2 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-LABEL: vst2_v2i8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    ldrb r2, [r0]
-; CHECK-NEXT:    ldrb r3, [r0, #2]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrb.w r12, [r0, #1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    ldrb r3, [r0, #1]
+; CHECK-NEXT:    ldrb.w r12, [r0, #2]
 ; CHECK-NEXT:    ldrb r0, [r0, #3]
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r12, r0
 ; CHECK-NEXT:    vstrb.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 1ae74c1738c7..9fcdc6385453 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -8,14 +8,13 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT:    ldrd lr, r12, [r0]
+; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
 ; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov.32 q1[2], r12
+; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.32 q1[3], lr
 ; CHECK-NEXT:    vmov.f32 s8, s7
 ; CHECK-NEXT:    vmov.f32 s10, s1
 ; CHECK-NEXT:    vmov r2, s8
@@ -302,18 +301,16 @@ define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) {
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    ldrh r2, [r0, #6]
-; CHECK-NEXT:    ldrh r3, [r0, #4]
+; CHECK-NEXT:    ldrh.w lr, [r0, #4]
 ; CHECK-NEXT:    ldrh.w r12, [r0, #8]
 ; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    ldrh.w lr, [r0, #2]
-; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    ldrh r3, [r0, #2]
+; CHECK-NEXT:    vmov q1[2], q1[0], lr, r2
 ; CHECK-NEXT:    ldrh r4, [r0]
-; CHECK-NEXT:    vmov.32 q1[2], r2
 ; CHECK-NEXT:    ldrh r0, [r0, #10]
 ; CHECK-NEXT:    vmov.16 q0[5], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
 ; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vmov.f32 s3, s2
@@ -689,9 +686,8 @@ define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) {
 ; CHECK-NEXT:    ldrb r2, [r0]
 ; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:    ldrb r3, [r0, #1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    ldrb.w r12, [r0, #2]
-; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    ldrb.w lr, [r0, #3]
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    ldrb r5, [r0, #5]
@@ -1461,23 +1457,21 @@ entry:
 define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-LABEL: vst3_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldrd lr, r12, [r0]
+; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
+; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r12, r2
+; CHECK-NEXT:    vmov.32 q1[0], r4
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.16 q2[0], r3
 ; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vmov.16 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s2
@@ -1486,6 +1480,7 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    vmovx.f16 s12, s4
 ; CHECK-NEXT:    vmov.16 q2[4], r0
 ; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vmov.16 q2[5], r0
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov.16 q2[6], r0
@@ -1505,7 +1500,7 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    strd r2, r0, [r1, #16]
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
   %l1 = load <4 x half>, <4 x half>* %s1, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index bc5415566424..078bdc762dc0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -8,18 +8,16 @@ define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT:    ldrd lr, r12, [r0]
+; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r0
 ; CHECK-NEXT:    vmov.f64 d0, d2
 ; CHECK-NEXT:    vmov.f32 s1, s6
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r3
-; CHECK-NEXT:    vmov.32 q1[2], r12
-; CHECK-NEXT:    vmov.32 q1[3], lr
+; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
 ; CHECK-NEXT:    vmov.f64 d4, d2
 ; CHECK-NEXT:    vmov.f32 s9, s6
 ; CHECK-NEXT:    vmov.f32 s10, s0
@@ -209,23 +207,22 @@ define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrh r4, [r0]
-; CHECK-NEXT:    ldrh.w lr, [r0, #4]
+; CHECK-NEXT:    ldrh r2, [r0, #2]
+; CHECK-NEXT:    ldrh.w r12, [r0, #4]
 ; CHECK-NEXT:    ldrh r3, [r0, #8]
-; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    ldrh.w r12, [r0, #6]
-; CHECK-NEXT:    ldrh r2, [r0, #10]
-; CHECK-NEXT:    ldrh r0, [r0, #2]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q0[0], r4
-; CHECK-NEXT:    vmov.16 q0[1], lr
+; CHECK-NEXT:    ldrh.w lr, [r0, #6]
+; CHECK-NEXT:    ldrh r4, [r0, #10]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.16 q0[1], r12
 ; CHECK-NEXT:    vmov.16 q0[2], r3
 ; CHECK-NEXT:    vmov.16 q0[3], r3
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r12
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.16 q0[7], r2
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], lr
+; CHECK-NEXT:    vmov.16 q0[6], r4
+; CHECK-NEXT:    vmov.16 q0[7], r4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -376,9 +373,8 @@ define void @vst4_v2i8(<2 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    ldrb r2, [r0]
 ; CHECK-NEXT:    ldrb r3, [r0, #1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    ldrb.w r12, [r0, #2]
-; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    ldrb.w lr, [r0, #3]
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ldrb r4, [r0, #5]
@@ -911,61 +907,58 @@ entry:
 define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vst4_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vmov.32 q0[3], lr
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmovx.f16 s12, s1
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldrd lr, r12, [r0]
+; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
+; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
+; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r0
+; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    vmov.16 q2[0], r3
 ; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmovx.f16 s12, s5
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmovx.f16 s12, s7
 ; CHECK-NEXT:    vmov.16 q2[4], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vmovx.f16 s12, s1
 ; CHECK-NEXT:    vmov.16 q2[5], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s12, s3
 ; CHECK-NEXT:    vmov.16 q2[6], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmovx.f16 s12, s4
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s4, s6
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s0
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s6
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vmov.16 q2[6], r0
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov.16 q2[7], r0
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
   %l1 = load <4 x half>, <4 x half>* %s1, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
index 6d7fb9e59c03..fe535c641ca7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -275,45 +275,37 @@ define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i3
 ; CHECK-LABEL: foo_int32_int8_both:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q0[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[4]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r3, q1[5]
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
 ; CHECK-NEXT:    vmovlb.u16 q2, q0
 ; CHECK-NEXT:    vldrb.s16 q0, [r1]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
 ; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmovlb.u16 q1, q2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
 ; CHECK-NEXT:    vmovlb.u16 q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr


        


More information about the llvm-commits mailing list