[llvm-branch-commits] [llvm] 6e913e4 - Revert "[ARM] Match dual lane vmovs from insert_vector_elt"

David Green via llvm-branch-commits <llvm-branch-commits at lists.llvm.org>
Fri Dec 18 05:38:19 PST 2020


Author: David Green
Date: 2020-12-18T13:33:40Z
New Revision: 6e913e44519245a79ec098c9b1459007dae84804

URL: https://github.com/llvm/llvm-project/commit/6e913e44519245a79ec098c9b1459007dae84804
DIFF: https://github.com/llvm/llvm-project/commit/6e913e44519245a79ec098c9b1459007dae84804.diff

LOG: Revert "[ARM] Match dual lane vmovs from insert_vector_elt"

This one needed more testing.

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/test/CodeGen/Thumb2/active_lane_mask.ll
    llvm/test/CodeGen/Thumb2/mve-abs.ll
    llvm/test/CodeGen/Thumb2/mve-div-expand.ll
    llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
    llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/test/CodeGen/Thumb2/mve-minmax.ll
    llvm/test/CodeGen/Thumb2/mve-neg.ll
    llvm/test/CodeGen/Thumb2/mve-phireg.ll
    llvm/test/CodeGen/Thumb2/mve-pred-and.ll
    llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
    llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
    llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
    llvm/test/CodeGen/Thumb2/mve-pred-not.ll
    llvm/test/CodeGen/Thumb2/mve-pred-or.ll
    llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-sext.ll
    llvm/test/CodeGen/Thumb2/mve-shifts.ll
    llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
    llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
    llvm/test/CodeGen/Thumb2/mve-vabdus.ll
    llvm/test/CodeGen/Thumb2/mve-vcmp.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
    llvm/test/CodeGen/Thumb2/mve-vcreate.ll
    llvm/test/CodeGen/Thumb2/mve-vcvt.ll
    llvm/test/CodeGen/Thumb2/mve-vdup.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
    llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld3.ll
    llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vmulh.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
    llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
    llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
    llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll
    llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 2d937930d89f..d792240c9ffd 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4790,14 +4790,6 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
       }
     }
   }
-  if (MI.getOpcode() == ARM::MVE_VMOV_q_rr) {
-    assert(MI.getOperand(4).isImm() && MI.getOperand(5).isImm());
-    if ((MI.getOperand(4).getImm() != 2 && MI.getOperand(4).getImm() != 3) ||
-        MI.getOperand(4).getImm() != MI.getOperand(5).getImm() + 2) {
-      ErrInfo = "Incorrect array index for MVE_VMOV_q_rr";
-      return false;
-    }
-  }
   return true;
 }
 

diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 8595705e023e..42498be05eea 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5845,41 +5845,6 @@ def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd),
   let AsmMatchConverter = "cvtMVEVMOVQtoDReg";
 }
 
-let Predicates = [HasMVEInt] in {
-  // Double lane moves. There are a number of patterns here. We know that the
-  // insertelt's will be in descending order by index, and need to match the 5
-  // patterns that might contain 2-0 or 3-1 pairs. These are:
-  // 3 2 1 0    -> vmovqrr 31; vmovqrr 20
-  // 3 2 1      -> vmovqrr 31; vmov 2
-  // 3 1        -> vmovqrr 31
-  // 2 1 0      -> vmovqrr 20; vmov 1
-  // 2 0        -> vmovqrr 20
-  // The other potential patterns will be handled by single lane inserts.
-  def : Pat<(insertelt (insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
-                                                        rGPR:$srcA, (i32 0)),
-                                             rGPR:$srcB, (i32 1)),
-                                  rGPR:$srcC, (i32 2)),
-                       rGPR:$srcD, (i32 3)),
-            (MVE_VMOV_q_rr (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcC, rGPR:$srcA, (i32 2), (i32 0)),
-                           rGPR:$srcD, rGPR:$srcB, (i32 3), (i32 1))>;
-  def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
-                                             rGPR:$srcB, (i32 1)),
-                                  rGPR:$srcC, (i32 2)),
-                       rGPR:$srcD, (i32 3)),
-            (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 2)),
-                           rGPR:$srcD, rGPR:$srcB, (i32 3), (i32 1))>;
-  def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 1)), rGPR:$srcB, (i32 3)),
-            (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcB, rGPR:$srcA, (i32 3), (i32 1))>;
-  def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
-                                             rGPR:$srcB, (i32 0)),
-                                  rGPR:$srcC, (i32 1)),
-                       rGPR:$srcD, (i32 2)),
-            (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 1)),
-                           rGPR:$srcD, rGPR:$srcB, (i32 2), (i32 0))>;
-  def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 0)), rGPR:$srcB, (i32 2)),
-            (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcB, rGPR:$srcA, (i32 2), (i32 0))>;
-}
-
 // end of coproc mov
 
 // start of MVE interleaving load/store

diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 09f594216e7f..729493163b81 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -39,7 +39,6 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    adr r3, .LCPI1_0
 ; CHECK-NEXT:    vdup.32 q1, r1
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    ldr r3, [sp, #40]
 ; CHECK-NEXT:    vadd.i32 q2, q0, r1
 ; CHECK-NEXT:    vdup.32 q0, r2
 ; CHECK-NEXT:    vcmp.u32 hi, q1, q2
@@ -47,16 +46,21 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.u32 hi, q0, q2
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    ldr r2, [sp, #36]
-; CHECK-NEXT:    ldr r3, [sp, #44]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    ldr r2, [sp, #40]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    ldr r2, [sp, #44]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    ldr r2, [sp]
-; CHECK-NEXT:    ldr r3, [sp, #8]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    ldr r2, [sp, #4]
-; CHECK-NEXT:    ldr r3, [sp, #12]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    ldr r2, [sp, #8]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    ldr r2, [sp, #12]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    adr r2, .LCPI1_1
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
@@ -66,19 +70,21 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    vcmp.u32 hi, q1, q2
 ; CHECK-NEXT:    vmrs r1, p0
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    ldr r2, [sp, #56]
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    ldr r1, [sp, #52]
+; CHECK-NEXT:    ldr r1, [sp, #48]
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.u32 hi, q0, q2
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    ldr r1, [sp, #52]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    ldr r1, [sp, #48]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    ldr r1, [sp, #56]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    ldr r1, [sp, #16]
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    ldr r1, [sp, #20]
-; CHECK-NEXT:    ldr r2, [sp, #24]
 ; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    ldr r1, [sp, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    ldr r1, [sp, #24]
+; CHECK-NEXT:    vmov.32 q1[2], r1
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov.f32 s2, s1
@@ -407,75 +413,81 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
 define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) {
 ; CHECK-LABEL: test_width2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    beq.w .LBB4_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    adds r0, r2, #1
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r2
+; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    bic r0, r0, #1
-; CHECK-NEXT:    adr r2, .LCPI4_0
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    subs r0, #2
-; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    vldrw.u32 q2, [r2]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:    add.w lr, r3, r0, lsr #1
-; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    adr r3, .LCPI4_0
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    vand q1, q1, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
+; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q3[2], q3[0], r8, r8
-; CHECK-NEXT:    vmov r7, s4
+; CHECK-NEXT:    vmov.32 q3[0], r6
+; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov.32 q3[2], r6
+; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vand q3, q3, q0
-; CHECK-NEXT:    vmov r6, s5
+; CHECK-NEXT:    adds r6, #2
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add.w r8, r8, #2
-; CHECK-NEXT:    vmov r9, s12
 ; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    adds r3, #1
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r9
-; CHECK-NEXT:    vand q3, q3, q0
 ; CHECK-NEXT:    adc r12, r2, #0
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r4, s13
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    subs r7, r5, r7
-; CHECK-NEXT:    vmov r7, s14
-; CHECK-NEXT:    sbcs r4, r6
-; CHECK-NEXT:    vmov r6, s15
-; CHECK-NEXT:    mov.w r4, #0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    subs r2, r7, r2
-; CHECK-NEXT:    sbcs.w r0, r6, r0
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vand q3, q3, q0
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    teq.w r4, r2
+; CHECK-NEXT:    cset r2, ne
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov.32 q4[1], r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    eors r3, r2
+; CHECK-NEXT:    orrs.w r3, r3, r12
+; CHECK-NEXT:    cset r3, ne
+; CHECK-NEXT:    tst.w r3, #1
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    subs r5, r4, r5
+; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov r5, s10
+; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    veor q4, q4, q1
+; CHECK-NEXT:    sbcs.w r0, r3, r0
+; CHECK-NEXT:    vmov r3, s11
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r0, r4
-; CHECK-NEXT:    eor.w r0, r7, r3
-; CHECK-NEXT:    orrs.w r0, r0, r12
-; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    teq.w r5, r9
-; CHECK-NEXT:    cset r2, ne
-; CHECK-NEXT:    tst.w r2, #1
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov q4[2], q4[0], r0, r2
-; CHECK-NEXT:    vmov q4[3], q4[1], r0, r2
-; CHECK-NEXT:    veor q4, q4, q2
-; CHECK-NEXT:    vand q4, q4, q3
+; CHECK-NEXT:    subs r2, r2, r5
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    @ implicit-def: $q3
+; CHECK-NEXT:    sbcs r0, r3
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vand q4, q4, q5
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    and r2, r2, #1
@@ -507,9 +519,8 @@ define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroe
 ; CHECK-NEXT:    le lr, .LBB4_2
 ; CHECK-NEXT:  .LBB4_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI4_0:

diff --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll
index 80681d1c878b..8a9b8814ef2e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-abs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll
@@ -42,18 +42,21 @@ define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    adds.w r1, r1, r0, asr #31
-; CHECK-NEXT:    adc.w r12, r0, r0, asr #31
+; CHECK-NEXT:    adc.w r2, r0, r0, asr #31
+; CHECK-NEXT:    eor.w r2, r2, r0, asr #31
+; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    adds.w r1, r1, r0, asr #31
 ; CHECK-NEXT:    eor.w r1, r1, r0, asr #31
-; CHECK-NEXT:    adds.w r2, r2, r3, asr #31
-; CHECK-NEXT:    eor.w r0, r12, r0, asr #31
-; CHECK-NEXT:    eor.w r2, r2, r3, asr #31
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    adc.w r1, r3, r3, asr #31
-; CHECK-NEXT:    eor.w r1, r1, r3, asr #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    adc.w r1, r0, r0, asr #31
+; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = icmp slt <2 x i64> %s1, zeroinitializer

diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index ad21dc1b320b..dc1ed2e187fb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -8,17 +8,19 @@ define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    udiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    udiv r1, r2, r1
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    udiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    udiv r1, r2, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
@@ -32,17 +34,19 @@ define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    sdiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    sdiv r1, r2, r1
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sdiv r0, r1, r0
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    sdiv r1, r2, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    sdiv r0, r1, r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
@@ -59,20 +63,22 @@ define arm_aapcs_vfpcc <4 x i32> @urem_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    udiv r2, r1, r0
 ; CHECK-NEXT:    mls r12, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    udiv r3, r2, r1
 ; CHECK-NEXT:    mls lr, r3, r1, r2
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    udiv r0, r3, r2
 ; CHECK-NEXT:    mls r0, r0, r2, r3
 ; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
+; CHECK-NEXT:    vmov.32 q0[0], r12
 ; CHECK-NEXT:    udiv r1, r3, r2
+; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    mls r1, r1, r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = urem <4 x i32> %in1, %in2
@@ -88,20 +94,22 @@ define arm_aapcs_vfpcc <4 x i32> @srem_i32(<4 x i32> %in1, <4 x i32> %in2) {
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    sdiv r2, r1, r0
 ; CHECK-NEXT:    mls r12, r2, r0, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    sdiv r3, r2, r1
 ; CHECK-NEXT:    mls lr, r3, r1, r2
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sdiv r0, r3, r2
 ; CHECK-NEXT:    mls r0, r0, r2, r3
 ; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
+; CHECK-NEXT:    vmov.32 q0[0], r12
 ; CHECK-NEXT:    sdiv r1, r3, r2
+; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    mls r1, r1, r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = srem <4 x i32> %in1, %in2
@@ -731,8 +739,8 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: udiv_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
@@ -742,17 +750,20 @@ define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov r12, s22
+; CHECK-NEXT:    vmov lr, s23
 ; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:    mov r1, lr
 ; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = udiv <2 x i64> %in1, %in2
   ret <2 x i64> %out
@@ -761,8 +772,8 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: sdiv_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
@@ -772,17 +783,20 @@ define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov r1, s23
+; CHECK-NEXT:    vmov r12, s22
+; CHECK-NEXT:    vmov lr, s23
 ; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:    mov r1, lr
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = sdiv <2 x i64> %in1, %in2
   ret <2 x i64> %out
@@ -791,8 +805,8 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @urem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: urem_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
@@ -802,17 +816,20 @@ define arm_aapcs_vfpcc <2 x i64> @urem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    vmov r12, s18
+; CHECK-NEXT:    vmov lr, s19
+; CHECK-NEXT:    vmov.32 q4[0], r2
 ; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov.32 q4[1], r3
 ; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    mov r3, lr
 ; CHECK-NEXT:    bl __aeabi_uldivmod
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
+; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = urem <2 x i64> %in1, %in2
   ret <2 x i64> %out
@@ -821,8 +838,8 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @srem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-LABEL: srem_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
@@ -832,17 +849,20 @@ define arm_aapcs_vfpcc <2 x i64> @srem_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    vmov r12, s18
+; CHECK-NEXT:    vmov lr, s19
+; CHECK-NEXT:    vmov.32 q4[0], r2
 ; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov.32 q4[1], r3
 ; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    mov r3, lr
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
+; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = srem <2 x i64> %in1, %in2
   ret <2 x i64> %out

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 45c5343184c1..0f3a91ca31af 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -589,46 +589,50 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vmov.u16 r7, q2[4]
-; CHECK-NEXT:    vmov.u16 r3, q2[6]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r7
-; CHECK-NEXT:    vmov.u16 r3, q2[5]
-; CHECK-NEXT:    vmov.u16 r7, q2[7]
 ; CHECK-NEXT:    vmov.u16 r5, q2[0]
-; CHECK-NEXT:    vmov q4[3], q4[1], r7, r3
-; CHECK-NEXT:    vmov.u16 r6, q2[2]
+; CHECK-NEXT:    vmov.32 q4[0], r7
+; CHECK-NEXT:    vmov.u16 r7, q2[5]
+; CHECK-NEXT:    vmov.32 q4[1], r7
+; CHECK-NEXT:    vmov.u16 r7, q2[6]
+; CHECK-NEXT:    vmov.32 q4[2], r7
+; CHECK-NEXT:    vmov.u16 r7, q2[7]
+; CHECK-NEXT:    vmov.32 q4[3], r7
+; CHECK-NEXT:    vmov.32 q3[0], r5
 ; CHECK-NEXT:    vmovlb.s16 q4, q4
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r5
-; CHECK-NEXT:    vshl.i32 q4, q4, #1
 ; CHECK-NEXT:    vmov.u16 r5, q2[1]
+; CHECK-NEXT:    vshl.i32 q4, q4, #1
+; CHECK-NEXT:    vmov.32 q3[1], r5
 ; CHECK-NEXT:    vadd.i32 q4, q4, r0
-; CHECK-NEXT:    vmov.u16 r6, q2[3]
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov q3[3], q3[1], r6, r5
+; CHECK-NEXT:    vmov.u16 r5, q2[2]
+; CHECK-NEXT:    vmov r7, s16
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.u16 r5, q2[3]
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov.32 q3[3], r5
+; CHECK-NEXT:    vadd.i16 q2, q2, q1
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vmov r7, s17
 ; CHECK-NEXT:    vshl.i32 q3, q3, #1
-; CHECK-NEXT:    vadd.i16 q2, q2, q1
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
 ; CHECK-NEXT:    vmov r5, s15
 ; CHECK-NEXT:    vmov r6, s14
 ; CHECK-NEXT:    vmov r12, s13
-; CHECK-NEXT:    ldrh.w r11, [r3]
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    ldrh r7, [r7]
+; CHECK-NEXT:    ldrh.w r11, [r7]
+; CHECK-NEXT:    vmov r7, s12
+; CHECK-NEXT:    ldrh r3, [r3]
 ; CHECK-NEXT:    ldrh.w r9, [r5]
 ; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    ldrh.w r10, [r6]
 ; CHECK-NEXT:    vmov r6, s19
 ; CHECK-NEXT:    ldrh.w r1, [r12]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.16 q3[0], r3
+; CHECK-NEXT:    ldrh r7, [r7]
+; CHECK-NEXT:    vmov.16 q3[0], r7
 ; CHECK-NEXT:    vmov.16 q3[1], r1
 ; CHECK-NEXT:    vmov.16 q3[2], r10
-; CHECK-NEXT:    ldrh r5, [r5]
 ; CHECK-NEXT:    vmov.16 q3[3], r9
-; CHECK-NEXT:    ldrh r6, [r6]
 ; CHECK-NEXT:    vmov.16 q3[4], r11
-; CHECK-NEXT:    vmov.16 q3[5], r7
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    vmov.16 q3[5], r3
+; CHECK-NEXT:    ldrh r6, [r6]
 ; CHECK-NEXT:    vmov.16 q3[6], r5
 ; CHECK-NEXT:    vmov.16 q3[7], r6
 ; CHECK-NEXT:    vstrb.8 q3, [r4], #16
@@ -700,27 +704,26 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
 ; CHECK-NEXT:    blt.w .LBB12_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    bic r1, r2, #7
+; CHECK-NEXT:    bic r8, r2, #7
 ; CHECK-NEXT:    adr r6, .LCPI12_2
-; CHECK-NEXT:    sub.w r3, r1, #8
+; CHECK-NEXT:    sub.w r3, r8, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
 ; CHECK-NEXT:    movs r7, #1
+; CHECK-NEXT:    vmov.i16 q3, #0x18
 ; CHECK-NEXT:    add.w r1, r7, r3, lsr #3
 ; CHECK-NEXT:    adr r3, .LCPI12_0
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    adr r7, .LCPI12_1
-; CHECK-NEXT:    vmov.i16 q3, #0x18
+; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
-; CHECK-NEXT:    str r1, [sp, #52] @ 4-byte Spill
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB12_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
-; CHECK-NEXT:    ldr r1, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    ldr r4, [sp, #60] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
@@ -730,27 +733,41 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vmov.u16 r3, q5[0]
-; CHECK-NEXT:    vmov.u16 r5, q5[2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
-; CHECK-NEXT:    vmov.u16 r3, q5[1]
-; CHECK-NEXT:    vmov.u16 r5, q5[3]
 ; CHECK-NEXT:    vmov.u16 r7, q7[4]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
-; CHECK-NEXT:    vmov.u16 r5, q5[6]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[1]
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[2]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[3]
+; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    vmov.u16 r12, q6[0]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov.u16 r12, q7[6]
+; CHECK-NEXT:    vmov.32 q1[0], r12
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vmov.u16 r1, q7[7]
+; CHECK-NEXT:    vmov.u16 r1, q6[1]
 ; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov.u16 r1, q6[2]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q6[3]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov.u16 r1, q6[4]
+; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    vmov r6, s11
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vadd.i32 q4, q1, r0
 ; CHECK-NEXT:    ldrh.w r9, [r3]
 ; CHECK-NEXT:    vmov.u16 r3, q5[4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r3
+; CHECK-NEXT:    vmov.32 q0[0], r3
 ; CHECK-NEXT:    vmov.u16 r3, q5[5]
-; CHECK-NEXT:    vmov.u16 r5, q5[7]
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[6]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.u16 r3, q5[7]
+; CHECK-NEXT:    vmov.32 q0[3], r3
 ; CHECK-NEXT:    ldrh r6, [r6]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
@@ -761,44 +778,42 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    ldrh r5, [r5]
 ; CHECK-NEXT:    ldrh.w r11, [r3]
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r12, r7
+; CHECK-NEXT:    vmov.32 q0[0], r7
 ; CHECK-NEXT:    vmov.u16 r7, q7[5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r7
+; CHECK-NEXT:    vmov.32 q0[1], r7
+; CHECK-NEXT:    vmov.u16 r7, q7[6]
+; CHECK-NEXT:    vmov.32 q0[2], r7
+; CHECK-NEXT:    vmov.u16 r7, q7[7]
+; CHECK-NEXT:    vmov.32 q0[3], r7
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r7, s2
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    ldrh.w r8, [r3]
-; CHECK-NEXT:    vmov.u16 r3, q6[2]
-; CHECK-NEXT:    ldrh r7, [r1]
-; CHECK-NEXT:    vmov.u16 r1, q6[0]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q6[1]
-; CHECK-NEXT:    vmov.u16 r3, q6[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q6[4]
-; CHECK-NEXT:    vmov.u16 r3, q6[6]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    vmov.32 q0[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q6[5]
-; CHECK-NEXT:    vmov.u16 r3, q6[7]
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q6[6]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q6[7]
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    vmov.u16 r1, q7[0]
-; CHECK-NEXT:    vmov.u16 r3, q7[2]
-; CHECK-NEXT:    vadd.i32 q4, q1, r0
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
+; CHECK-NEXT:    vmov.32 q3[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q7[1]
-; CHECK-NEXT:    vmov.u16 r3, q7[3]
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q7[2]
+; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q7[3]
+; CHECK-NEXT:    vmov.32 q3[3], r1
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vshl.i32 q3, q3, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    ldrh r7, [r7]
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q1[0], r1
 ; CHECK-NEXT:    vmov r1, s9
@@ -809,7 +824,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vmov.16 q1[3], r6
 ; CHECK-NEXT:    vmov.16 q1[4], r10
 ; CHECK-NEXT:    vmov.16 q1[5], r11
-; CHECK-NEXT:    vmov.16 q1[6], r8
+; CHECK-NEXT:    vmov.16 q1[6], r3
 ; CHECK-NEXT:    vmov.16 q1[7], r5
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
@@ -867,8 +882,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    le lr, .LBB12_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
-; CHECK-NEXT:    cmp r1, r2
+; CHECK-NEXT:    cmp r8, r2
 ; CHECK-NEXT:    bne.w .LBB12_2
 ; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #104

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
index 67522ee85e87..20e258d46b5e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
@@ -462,15 +462,17 @@ define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
 ; CHECK-NEXT:    vmov.i32 q1, #0x10
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    ldr r0, [r0]
+; CHECK-NEXT:    ldr r3, [r3]
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    ldr r1, [r1]
+; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    ldr r2, [r2]
-; CHECK-NEXT:    ldr r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    bx lr
 entry:
   %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
index 2738e2c95228..236a695c0a5d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
@@ -63,13 +63,15 @@ define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr)
 ; CHECK-NEXT:    ldrb r2, [r1]
 ; CHECK-NEXT:    vmov.i32 q0, #0xff
 ; CHECK-NEXT:    ldrb r1, [r1, #1]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[2], r1
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    ldrb r1, [r0, r1]
 ; CHECK-NEXT:    ldrb r0, [r0, r2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <2 x i8>, <2 x i8>* %offptr, align 1

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 4f16967d1a21..2a86ddbede65 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -9,7 +9,8 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(<2 x i32*>* %offptr) {
 ; CHECK-NEXT:    ldrd r1, r0, [r0]
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    ldr r1, [r1]
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4
@@ -36,26 +37,30 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) {
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov r5, s3
 ; CHECK-NEXT:    ldr.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    ldr.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    ldr r0, [r0]
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    ldr r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldr r4, [r4]
+; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldr r4, [r4]
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], lr
 ; CHECK-NEXT:    ldr r1, [r1]
 ; CHECK-NEXT:    ldr r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r4
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
@@ -66,53 +71,61 @@ entry:
 define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(<16 x i32*>* %offptr) {
 ; CHECK-LABEL: ptr_v16i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r6, s6
 ; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r7, s7
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov r6, s7
+; CHECK-NEXT:    vmov r4, s11
 ; CHECK-NEXT:    ldr.w r12, [r1]
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    ldr r6, [r6]
 ; CHECK-NEXT:    ldr r2, [r2]
-; CHECK-NEXT:    ldr r7, [r7]
 ; CHECK-NEXT:    ldr r0, [r0]
+; CHECK-NEXT:    ldr r6, [r6]
 ; CHECK-NEXT:    ldr r4, [r4]
 ; CHECK-NEXT:    ldr.w lr, [r1]
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    ldr r3, [r1]
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r5
-; CHECK-NEXT:    vmov r6, s5
-; CHECK-NEXT:    vmov r5, s9
+; CHECK-NEXT:    ldr r3, [r1]
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.32 q0[0], r5
+; CHECK-NEXT:    vmov r5, s5
 ; CHECK-NEXT:    ldr r1, [r1]
-; CHECK-NEXT:    ldr r6, [r6]
 ; CHECK-NEXT:    ldr r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r7, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    ldr r6, [r6]
-; CHECK-NEXT:    vmov q1[2], q1[0], r6, r0
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r6, s15
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, lr
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT:    ldr r5, [r5]
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    ldr r6, [r6]
-; CHECK-NEXT:    vmov q1[3], q1[1], r6, r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r12
+; CHECK-NEXT:    vmov.32 q0[2], r5
+; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.32 q0[3], r6
+; CHECK-NEXT:    ldr r5, [r5]
 ; CHECK-NEXT:    ldr r0, [r0]
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r5
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.32 q3[0], lr
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    ldr r0, [r0]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.32 q2[0], r5
+; CHECK-NEXT:    ldr r0, [r0]
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.32 q2[2], r12
+; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
   %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
@@ -220,10 +233,12 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(<2 x i16*>* %offptr) {
 ; CHECK-NEXT:    ldrd r1, r0, [r0]
 ; CHECK-NEXT:    ldrsh.w r0, [r0]
 ; CHECK-NEXT:    ldrsh.w r1, [r1]
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    vmov.32 q0[0], r1
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
@@ -239,7 +254,8 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(<2 x i16*>* %offptr) {
 ; CHECK-NEXT:    vmov.i64 q0, #0xffff
 ; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.32 q1[2], r0
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -253,16 +269,18 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -276,16 +294,18 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -302,27 +322,31 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov r5, s3
 ; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], lr
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r4
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -339,27 +363,31 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) {
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov r5, s3
 ; CHECK-NEXT:    ldrh.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    ldrh.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r0, [r0]
 ; CHECK-NEXT:    ldrh r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], lr
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r4
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -566,15 +594,17 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    bx lr
@@ -590,16 +620,18 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -616,28 +648,32 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov r5, s3
 ; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    ldrb.w lr, [r2]
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r0, [r0]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], lr
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r4
 ; CHECK-NEXT:    vmovlb.s8 q1, q1
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -655,28 +691,32 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) {
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    ldrb.w lr, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    ldrb.w lr, [r2]
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    ldrb r0, [r0]
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], lr
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -803,30 +843,34 @@ define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base,
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov r5, s3
 ; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    ldrb r0, [r0]
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    ldrb r2, [r2]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r3
-; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov.32 q0[3], lr
 ; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
index 8d146d1dbc84..17b28811fd00 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
@@ -8,15 +8,17 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32_gather(i8* %base, <4 x i32>*
 ; NOGATSCAT-NEXT:    vldrw.u32 q0, [r1]
 ; NOGATSCAT-NEXT:    vadd.i32 q0, q0, r0
 ; NOGATSCAT-NEXT:    vmov r0, s0
+; NOGATSCAT-NEXT:    vmov r3, s1
 ; NOGATSCAT-NEXT:    vmov r1, s2
-; NOGATSCAT-NEXT:    vmov r2, s1
-; NOGATSCAT-NEXT:    vmov r3, s3
+; NOGATSCAT-NEXT:    vmov r2, s3
 ; NOGATSCAT-NEXT:    ldr r0, [r0]
+; NOGATSCAT-NEXT:    ldr r3, [r3]
+; NOGATSCAT-NEXT:    vmov.32 q0[0], r0
 ; NOGATSCAT-NEXT:    ldr r1, [r1]
+; NOGATSCAT-NEXT:    vmov.32 q0[1], r3
 ; NOGATSCAT-NEXT:    ldr r2, [r2]
-; NOGATSCAT-NEXT:    ldr r3, [r3]
-; NOGATSCAT-NEXT:    vmov q0[2], q0[0], r1, r0
-; NOGATSCAT-NEXT:    vmov q0[3], q0[1], r3, r2
+; NOGATSCAT-NEXT:    vmov.32 q0[2], r1
+; NOGATSCAT-NEXT:    vmov.32 q0[3], r2
 ; NOGATSCAT-NEXT:    bx lr
 ;
 ; NOMVE-LABEL: unscaled_i32_i32_gather:

diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index 7b2343a0f905..a3f06e5d2537 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -93,61 +93,64 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
-; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
-; CHECK-LE-NEXT:    vmov q1[2], q1[0], r5, lr
-; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-LE-NEXT:    mov.w r1, #0
+; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q1
+; CHECK-LE-NEXT:    movs r4, #0
+; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
+; CHECK-LE-NEXT:    vmov.32 q0[0], lr
+; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT:    mov.w lr, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r4, r5, #0
-; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    movlt.w lr, #1
+; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r3, #1
-; CHECK-LE-NEXT:    cmp r3, #0
+; CHECK-LE-NEXT:    movlt r1, #1
+; CHECK-LE-NEXT:    cmp r1, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r3, #1
-; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
-; CHECK-LE-NEXT:    vmov r4, s4
-; CHECK-LE-NEXT:    and r12, r3, #3
-; CHECK-LE-NEXT:    lsls r1, r3, #31
+; CHECK-LE-NEXT:    mvnne r1, #1
+; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
+; CHECK-LE-NEXT:    vmov.32 q0[2], r12
+; CHECK-LE-NEXT:    and r3, r1, #3
+; CHECK-LE-NEXT:    lsls r1, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
-; CHECK-LE-NEXT:    lsls.w r1, r12, #30
+; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
+; CHECK-LE-NEXT:    lsls r1, r3, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
-; CHECK-LE-NEXT:    vmov r3, s2
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmov r1, s0
-; CHECK-LE-NEXT:    vmov q0[2], q0[0], r3, r1
-; CHECK-LE-NEXT:    rsbs r5, r4, #0
+; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT:    vmov r2, s0
+; CHECK-LE-NEXT:    vmov r3, s4
+; CHECK-LE-NEXT:    vmov r1, s6
+; CHECK-LE-NEXT:    vmov.32 q1[0], r3
+; CHECK-LE-NEXT:    rsbs r5, r2, #0
+; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT:    vmov r2, s2
 ; CHECK-LE-NEXT:    asr.w lr, r3, #31
-; CHECK-LE-NEXT:    vmov r3, s6
+; CHECK-LE-NEXT:    vmov.32 q1[1], lr
 ; CHECK-LE-NEXT:    asr.w r12, r1, #31
-; CHECK-LE-NEXT:    sbcs.w r1, r2, r4, asr #31
+; CHECK-LE-NEXT:    vmov.32 q1[2], r1
 ; CHECK-LE-NEXT:    mov.w r1, #0
-; CHECK-LE-NEXT:    vmov q0[3], q0[1], lr, r12
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r5, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
+; CHECK-LE-NEXT:    vmov.32 q1[3], r12
+; CHECK-LE-NEXT:    rsbs r3, r2, #0
+; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r2, #1
-; CHECK-LE-NEXT:    cmp r2, #0
+; CHECK-LE-NEXT:    movlt r4, #1
+; CHECK-LE-NEXT:    cmp r4, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
-; CHECK-LE-NEXT:    and r1, r2, #3
-; CHECK-LE-NEXT:    lsls r2, r2, #31
+; CHECK-LE-NEXT:    mvnne r4, #1
+; CHECK-LE-NEXT:    bfi r4, r1, #0, #1
+; CHECK-LE-NEXT:    and r1, r4, #3
+; CHECK-LE-NEXT:    lsls r2, r4, #31
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne d0, [r0]
+; CHECK-LE-NEXT:    vstrne d2, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
+; CHECK-LE-NEXT:    vstrmi d3, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
 ;
@@ -161,7 +164,9 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov q0[3], q0[1], lr, r12
+; CHECK-BE-NEXT:    vmov.32 q0[1], r12
+; CHECK-BE-NEXT:    @ implicit-def: $q2
+; CHECK-BE-NEXT:    vmov.32 q0[3], lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -173,7 +178,6 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
-; CHECK-BE-NEXT:    @ implicit-def: $q2
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB5_2
@@ -195,15 +199,17 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vmov r2, s11
 ; CHECK-BE-NEXT:    movs r4, #0
-; CHECK-BE-NEXT:    vmov r1, s1
-; CHECK-BE-NEXT:    vmov r3, s3
+; CHECK-BE-NEXT:    vmov r3, s1
+; CHECK-BE-NEXT:    vmov r1, s3
 ; CHECK-BE-NEXT:    rsbs r5, r2, #0
 ; CHECK-BE-NEXT:    sbcs.w r2, r4, r2, asr #31
 ; CHECK-BE-NEXT:    vmov r2, s9
-; CHECK-BE-NEXT:    asr.w r12, r1, #31
 ; CHECK-BE-NEXT:    asr.w lr, r3, #31
-; CHECK-BE-NEXT:    vmov q1[2], q1[0], lr, r12
-; CHECK-BE-NEXT:    vmov q1[3], q1[1], r3, r1
+; CHECK-BE-NEXT:    vmov.32 q1[0], lr
+; CHECK-BE-NEXT:    asr.w r12, r1, #31
+; CHECK-BE-NEXT:    vmov.32 q1[1], r3
+; CHECK-BE-NEXT:    vmov.32 q1[2], r12
+; CHECK-BE-NEXT:    vmov.32 q1[3], r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
@@ -241,62 +247,65 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
-; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
-; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
-; CHECK-LE-NEXT:    vmov q1[2], q1[0], r5, lr
-; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-LE-NEXT:    mov.w r1, #0
+; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q1
+; CHECK-LE-NEXT:    movs r4, #0
+; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
+; CHECK-LE-NEXT:    vmov.32 q0[0], lr
+; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT:    mov.w lr, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r4, r5, #0
-; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    movlt.w lr, #1
+; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r3, #1
-; CHECK-LE-NEXT:    cmp r3, #0
+; CHECK-LE-NEXT:    movlt r1, #1
+; CHECK-LE-NEXT:    cmp r1, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r3, #1
-; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
-; CHECK-LE-NEXT:    vmov r4, s4
-; CHECK-LE-NEXT:    and r12, r3, #3
-; CHECK-LE-NEXT:    lsls r1, r3, #31
+; CHECK-LE-NEXT:    mvnne r1, #1
+; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
+; CHECK-LE-NEXT:    vmov.32 q0[2], r12
+; CHECK-LE-NEXT:    and r3, r1, #3
+; CHECK-LE-NEXT:    lsls r1, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
-; CHECK-LE-NEXT:    lsls.w r1, r12, #30
+; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
+; CHECK-LE-NEXT:    lsls r1, r3, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
-; CHECK-LE-NEXT:    vmov r3, s2
-; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vmov r1, s0
-; CHECK-LE-NEXT:    vmov q0[2], q0[0], r3, r1
-; CHECK-LE-NEXT:    rsbs r5, r4, #0
+; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT:    vmov r2, s0
+; CHECK-LE-NEXT:    vmov r3, s4
+; CHECK-LE-NEXT:    vmov r1, s6
+; CHECK-LE-NEXT:    vmov.32 q1[0], r3
+; CHECK-LE-NEXT:    rsbs r5, r2, #0
+; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT:    vmov r2, s2
 ; CHECK-LE-NEXT:    asr.w lr, r3, #31
-; CHECK-LE-NEXT:    vmov r3, s6
+; CHECK-LE-NEXT:    vmov.32 q1[1], lr
 ; CHECK-LE-NEXT:    asr.w r12, r1, #31
-; CHECK-LE-NEXT:    sbcs.w r1, r2, r4, asr #31
+; CHECK-LE-NEXT:    vmov.32 q1[2], r1
 ; CHECK-LE-NEXT:    mov.w r1, #0
-; CHECK-LE-NEXT:    vmov q0[3], q0[1], lr, r12
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r5, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
+; CHECK-LE-NEXT:    vmov.32 q1[3], r12
+; CHECK-LE-NEXT:    rsbs r3, r2, #0
+; CHECK-LE-NEXT:    sbcs.w r2, r4, r2, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r2, #1
-; CHECK-LE-NEXT:    cmp r2, #0
+; CHECK-LE-NEXT:    movlt r4, #1
+; CHECK-LE-NEXT:    cmp r4, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
-; CHECK-LE-NEXT:    and r1, r2, #3
-; CHECK-LE-NEXT:    lsls r2, r2, #31
+; CHECK-LE-NEXT:    mvnne r4, #1
+; CHECK-LE-NEXT:    bfi r4, r1, #0, #1
+; CHECK-LE-NEXT:    and r1, r4, #3
+; CHECK-LE-NEXT:    lsls r2, r4, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, r3, d0
+; CHECK-LE-NEXT:    vmovne r2, r3, d2
 ; CHECK-LE-NEXT:    strdne r2, r3, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, r2, d1
+; CHECK-LE-NEXT:    vmovmi r1, r2, d3
 ; CHECK-LE-NEXT:    strdmi r1, r2, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
@@ -311,7 +320,9 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov q0[3], q0[1], lr, r12
+; CHECK-BE-NEXT:    vmov.32 q0[1], r12
+; CHECK-BE-NEXT:    @ implicit-def: $q2
+; CHECK-BE-NEXT:    vmov.32 q0[3], lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -323,7 +334,6 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
-; CHECK-BE-NEXT:    @ implicit-def: $q2
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB6_2
@@ -345,15 +355,17 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vmov r2, s11
 ; CHECK-BE-NEXT:    movs r4, #0
-; CHECK-BE-NEXT:    vmov r1, s1
-; CHECK-BE-NEXT:    vmov r3, s3
+; CHECK-BE-NEXT:    vmov r3, s1
+; CHECK-BE-NEXT:    vmov r1, s3
 ; CHECK-BE-NEXT:    rsbs r5, r2, #0
 ; CHECK-BE-NEXT:    sbcs.w r2, r4, r2, asr #31
 ; CHECK-BE-NEXT:    vmov r2, s9
-; CHECK-BE-NEXT:    asr.w r12, r1, #31
 ; CHECK-BE-NEXT:    asr.w lr, r3, #31
-; CHECK-BE-NEXT:    vmov q1[2], q1[0], lr, r12
-; CHECK-BE-NEXT:    vmov q1[3], q1[1], r3, r1
+; CHECK-BE-NEXT:    vmov.32 q1[0], lr
+; CHECK-BE-NEXT:    asr.w r12, r1, #31
+; CHECK-BE-NEXT:    vmov.32 q1[1], r3
+; CHECK-BE-NEXT:    vmov.32 q1[2], r12
+; CHECK-BE-NEXT:    vmov.32 q1[3], r1
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt r1, #1
@@ -389,63 +401,64 @@ entry:
 define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
 ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
-; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q1
 ; CHECK-LE-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
-; CHECK-LE-NEXT:    vmov q1[2], q1[0], r5, lr
-; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-LE-NEXT:    mov.w r1, #0
+; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
+; CHECK-LE-NEXT:    vmov.32 q0[0], lr
+; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT:    mov.w lr, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r4, r5, #0
-; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    movlt.w lr, #1
+; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r3, #1
-; CHECK-LE-NEXT:    cmp r3, #0
+; CHECK-LE-NEXT:    movlt r1, #1
+; CHECK-LE-NEXT:    cmp r1, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r3, #1
-; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
-; CHECK-LE-NEXT:    and r12, r3, #3
-; CHECK-LE-NEXT:    lsls r1, r3, #31
+; CHECK-LE-NEXT:    mvnne r1, #1
+; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
+; CHECK-LE-NEXT:    vmov.32 q0[2], r12
+; CHECK-LE-NEXT:    and r3, r1, #3
+; CHECK-LE-NEXT:    mov.w r12, #0
+; CHECK-LE-NEXT:    lsls r1, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
-; CHECK-LE-NEXT:    lsls.w r1, r12, #30
+; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
+; CHECK-LE-NEXT:    lsls r1, r3, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
-; CHECK-LE-NEXT:    vmov r1, s4
+; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT:    vmov r1, s0
 ; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vand q0, q0, q2
+; CHECK-LE-NEXT:    vand q1, q1, q2
 ; CHECK-LE-NEXT:    rsbs r3, r1, #0
-; CHECK-LE-NEXT:    vmov r3, s6
+; CHECK-LE-NEXT:    vmov r3, s2
 ; CHECK-LE-NEXT:    sbcs.w r1, r2, r1, asr #31
-; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r5, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
+; CHECK-LE-NEXT:    movlt.w r12, #1
+; CHECK-LE-NEXT:    rsbs r1, r3, #0
+; CHECK-LE-NEXT:    sbcs.w r1, r2, r3, asr #31
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r2, #1
 ; CHECK-LE-NEXT:    cmp r2, #0
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
+; CHECK-LE-NEXT:    bfi r2, r12, #0, #1
 ; CHECK-LE-NEXT:    and r1, r2, #3
 ; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    vstrne d0, [r0]
+; CHECK-LE-NEXT:    vstrne d2, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    it mi
-; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
+; CHECK-LE-NEXT:    vstrmi d3, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT:    pop {r7, pc}
 ;
 ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32:
 ; CHECK-BE:       @ %bb.0: @ %entry
@@ -457,7 +470,9 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov q0[3], q0[1], lr, r12
+; CHECK-BE-NEXT:    vmov.32 q0[1], r12
+; CHECK-BE-NEXT:    @ implicit-def: $q1
+; CHECK-BE-NEXT:    vmov.32 q0[3], lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -469,7 +484,6 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
-; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB7_2
@@ -527,65 +541,66 @@ entry:
 define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
 ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32_unaligned:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
 ; CHECK-LE-NEXT:    .pad #4
 ; CHECK-LE-NEXT:    sub sp, #4
-; CHECK-LE-NEXT:    ldrd lr, r5, [r1]
-; CHECK-LE-NEXT:    movs r3, #0
-; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    ldrd lr, r12, [r1]
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    @ implicit-def: $q1
 ; CHECK-LE-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-LE-NEXT:    rsbs.w r1, lr, #0
-; CHECK-LE-NEXT:    vmov q1[2], q1[0], r5, lr
-; CHECK-LE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-LE-NEXT:    mov.w r1, #0
+; CHECK-LE-NEXT:    rsbs.w r3, lr, #0
+; CHECK-LE-NEXT:    vmov.32 q0[0], lr
+; CHECK-LE-NEXT:    sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT:    mov.w lr, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r4, r5, #0
-; CHECK-LE-NEXT:    sbcs.w r4, r3, r5, asr #31
+; CHECK-LE-NEXT:    movlt.w lr, #1
+; CHECK-LE-NEXT:    rsbs.w r3, r12, #0
+; CHECK-LE-NEXT:    sbcs.w r3, r1, r12, asr #31
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r3, #1
-; CHECK-LE-NEXT:    cmp r3, #0
+; CHECK-LE-NEXT:    movlt r1, #1
+; CHECK-LE-NEXT:    cmp r1, #0
 ; CHECK-LE-NEXT:    it ne
-; CHECK-LE-NEXT:    mvnne r3, #1
-; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
-; CHECK-LE-NEXT:    and r12, r3, #3
-; CHECK-LE-NEXT:    lsls r1, r3, #31
+; CHECK-LE-NEXT:    mvnne r1, #1
+; CHECK-LE-NEXT:    bfi r1, lr, #0, #1
+; CHECK-LE-NEXT:    vmov.32 q0[2], r12
+; CHECK-LE-NEXT:    and r3, r1, #3
+; CHECK-LE-NEXT:    mov.w r12, #0
+; CHECK-LE-NEXT:    lsls r1, r1, #31
 ; CHECK-LE-NEXT:    itt ne
 ; CHECK-LE-NEXT:    ldrne r1, [r2]
-; CHECK-LE-NEXT:    vmovne.32 q0[0], r1
-; CHECK-LE-NEXT:    lsls.w r1, r12, #30
+; CHECK-LE-NEXT:    vmovne.32 q1[0], r1
+; CHECK-LE-NEXT:    lsls r1, r3, #30
 ; CHECK-LE-NEXT:    itt mi
 ; CHECK-LE-NEXT:    ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT:    vmovmi.32 q0[2], r1
-; CHECK-LE-NEXT:    vmov r1, s4
+; CHECK-LE-NEXT:    vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT:    vmov r1, s0
 ; CHECK-LE-NEXT:    movs r2, #0
-; CHECK-LE-NEXT:    vand q0, q0, q2
+; CHECK-LE-NEXT:    vand q1, q1, q2
 ; CHECK-LE-NEXT:    rsbs r3, r1, #0
-; CHECK-LE-NEXT:    vmov r3, s6
+; CHECK-LE-NEXT:    vmov r3, s2
 ; CHECK-LE-NEXT:    sbcs.w r1, r2, r1, asr #31
-; CHECK-LE-NEXT:    mov.w r1, #0
 ; CHECK-LE-NEXT:    it lt
-; CHECK-LE-NEXT:    movlt r1, #1
-; CHECK-LE-NEXT:    rsbs r5, r3, #0
-; CHECK-LE-NEXT:    sbcs.w r3, r2, r3, asr #31
+; CHECK-LE-NEXT:    movlt.w r12, #1
+; CHECK-LE-NEXT:    rsbs r1, r3, #0
+; CHECK-LE-NEXT:    sbcs.w r1, r2, r3, asr #31
 ; CHECK-LE-NEXT:    it lt
 ; CHECK-LE-NEXT:    movlt r2, #1
 ; CHECK-LE-NEXT:    cmp r2, #0
 ; CHECK-LE-NEXT:    it ne
 ; CHECK-LE-NEXT:    mvnne r2, #1
-; CHECK-LE-NEXT:    bfi r2, r1, #0, #1
+; CHECK-LE-NEXT:    bfi r2, r12, #0, #1
 ; CHECK-LE-NEXT:    and r1, r2, #3
 ; CHECK-LE-NEXT:    lsls r2, r2, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, r3, d0
+; CHECK-LE-NEXT:    vmovne r2, r3, d2
 ; CHECK-LE-NEXT:    strdne r2, r3, [r0]
 ; CHECK-LE-NEXT:    lsls r1, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, r2, d1
+; CHECK-LE-NEXT:    vmovmi r1, r2, d3
 ; CHECK-LE-NEXT:    strdmi r1, r2, [r0, #8]
 ; CHECK-LE-NEXT:    add sp, #4
-; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT:    pop {r7, pc}
 ;
 ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned:
 ; CHECK-BE:       @ %bb.0: @ %entry
@@ -597,7 +612,9 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    rsbs.w r1, lr, #0
 ; CHECK-BE-NEXT:    mov.w r3, #0
 ; CHECK-BE-NEXT:    sbcs.w r1, r3, lr, asr #31
-; CHECK-BE-NEXT:    vmov q0[3], q0[1], lr, r12
+; CHECK-BE-NEXT:    vmov.32 q0[1], r12
+; CHECK-BE-NEXT:    @ implicit-def: $q1
+; CHECK-BE-NEXT:    vmov.32 q0[3], lr
 ; CHECK-BE-NEXT:    mov.w lr, #0
 ; CHECK-BE-NEXT:    it lt
 ; CHECK-BE-NEXT:    movlt.w lr, #1
@@ -609,7 +626,6 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 ; CHECK-BE-NEXT:    it ne
 ; CHECK-BE-NEXT:    mvnne r3, #1
 ; CHECK-BE-NEXT:    bfi r3, lr, #0, #1
-; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    and r1, r3, #3
 ; CHECK-BE-NEXT:    lsls r3, r3, #31
 ; CHECK-BE-NEXT:    beq .LBB8_2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index 9f56367598d4..c533216127bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -40,17 +40,17 @@ define arm_aapcs_vfpcc <2 x i64> @smin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov lr, s3
-; CHECK-NEXT:    subs r2, r3, r2
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    sbcs.w r1, r1, r12
 ; CHECK-NEXT:    vmov r12, s7
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov lr, s1
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    sbcs.w r1, r1, r12
+; CHECK-NEXT:    vmov r12, s5
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
@@ -62,8 +62,10 @@ define arm_aapcs_vfpcc <2 x i64> @smin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.32 q2[3], r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -112,17 +114,17 @@ define arm_aapcs_vfpcc <2 x i64> @umin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov lr, s3
-; CHECK-NEXT:    subs r2, r3, r2
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    sbcs.w r1, r1, r12
 ; CHECK-NEXT:    vmov r12, s7
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov lr, s1
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    sbcs.w r1, r1, r12
+; CHECK-NEXT:    vmov r12, s5
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r1, #1
@@ -134,8 +136,10 @@ define arm_aapcs_vfpcc <2 x i64> @umin_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.32 q2[3], r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -185,17 +189,17 @@ define arm_aapcs_vfpcc <2 x i64> @smax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r12, s1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov lr, s7
-; CHECK-NEXT:    subs r2, r3, r2
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    sbcs.w r1, r1, r12
 ; CHECK-NEXT:    vmov r12, s3
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov lr, s5
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sbcs.w r1, r1, r12
+; CHECK-NEXT:    vmov r12, s1
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
@@ -207,8 +211,10 @@ define arm_aapcs_vfpcc <2 x i64> @smax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.32 q2[3], r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -257,17 +263,17 @@ define arm_aapcs_vfpcc <2 x i64> @umax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r12, s1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov lr, s7
-; CHECK-NEXT:    subs r2, r3, r2
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    sbcs.w r1, r1, r12
 ; CHECK-NEXT:    vmov r12, s3
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov lr, s5
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    sbcs.w r1, r1, r12
+; CHECK-NEXT:    vmov r12, s1
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r1, #1
@@ -279,8 +285,10 @@ define arm_aapcs_vfpcc <2 x i64> @umax_v2i64(<2 x i64> %s1, <2 x i64> %s2) {
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.32 q2[3], r1
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -369,12 +377,12 @@ define arm_aapcs_vfpcc <2 x double> @maxnm_float64_t(<2 x double> %src1, <2 x do
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, r1, d8
-; CHECK-NEXT:    vmov r2, r3, d10
+; CHECK-NEXT:    vmov r0, r1, d9
+; CHECK-NEXT:    vmov r2, r3, d11
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    vmov r12, r1, d9
+; CHECK-NEXT:    vmov r12, r1, d8
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov r2, r3, d11
+; CHECK-NEXT:    vmov r2, r3, d10
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r0, #1
 ; CHECK-NEXT:    cmp r0, #0
@@ -386,8 +394,10 @@ define arm_aapcs_vfpcc <2 x double> @maxnm_float64_t(<2 x double> %src1, <2 x do
 ; CHECK-NEXT:    movne r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r4
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q0[2], r4
+; CHECK-NEXT:    vmov.32 q0[3], r4
 ; CHECK-NEXT:    vbic q1, q5, q0
 ; CHECK-NEXT:    vand q0, q4, q0
 ; CHECK-NEXT:    vorr q0, q0, q1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-neg.ll b/llvm/test/CodeGen/Thumb2/mve-neg.ll
index f3f3fea81a17..2d8d0f4ac519 100644
--- a/llvm/test/CodeGen/Thumb2/mve-neg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-neg.ll
@@ -34,17 +34,19 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @neg_v2i64(<2 x i64> %s1) {
 ; CHECK-LABEL: neg_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    sbc.w r0, r12, r0
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sbc.w r3, r12, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r0
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sub nsw <2 x i64> zeroinitializer, %s1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index b08f88449d7d..0fe26fbc4753 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -149,60 +149,67 @@ define dso_local i32 @e() #0 {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #392
 ; CHECK-NEXT:    sub sp, #392
-; CHECK-NEXT:    movw r10, :lower16:.L_MergedGlobals
+; CHECK-NEXT:    movw r9, :lower16:.L_MergedGlobals
 ; CHECK-NEXT:    vldr s0, .LCPI1_0
-; CHECK-NEXT:    movt r10, :upper16:.L_MergedGlobals
+; CHECK-NEXT:    movt r9, :upper16:.L_MergedGlobals
 ; CHECK-NEXT:    vldr s3, .LCPI1_1
-; CHECK-NEXT:    mov r6, r10
-; CHECK-NEXT:    mov r7, r10
-; CHECK-NEXT:    ldr r1, [r6, #4]!
-; CHECK-NEXT:    movw r5, :lower16:e
-; CHECK-NEXT:    ldr r0, [r7, #8]!
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov s1, r6
-; CHECK-NEXT:    movt r5, :upper16:e
-; CHECK-NEXT:    vmov q1[2], q1[0], r7, r7
-; CHECK-NEXT:    vmov s9, r5
-; CHECK-NEXT:    vdup.32 q4, r6
+; CHECK-NEXT:    mov r5, r9
+; CHECK-NEXT:    mov r7, r9
+; CHECK-NEXT:    ldr r1, [r5, #8]!
+; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    ldr r0, [r7, #4]!
+; CHECK-NEXT:    movw r4, :lower16:e
+; CHECK-NEXT:    vmov.32 q4[0], r5
+; CHECK-NEXT:    movt r4, :upper16:e
+; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vmov s1, r7
+; CHECK-NEXT:    vmov.32 q1[1], r6
+; CHECK-NEXT:    vmov.32 q5[0], r7
+; CHECK-NEXT:    vmov.32 q1[2], r5
+; CHECK-NEXT:    vmov s9, r4
+; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vdup.32 q6, r7
+; CHECK-NEXT:    vstrw.32 q1, [sp, #76]
+; CHECK-NEXT:    vmov q1, q5
+; CHECK-NEXT:    vmov.32 q1[1], r7
 ; CHECK-NEXT:    vmov.f32 s2, s1
 ; CHECK-NEXT:    vmov.f32 s8, s0
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r4
-; CHECK-NEXT:    vmov q3, q4
-; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    vmov.32 q1[2], r6
+; CHECK-NEXT:    vmov q3, q6
+; CHECK-NEXT:    vmov q7, q6
 ; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vstrw.32 q1, [sp, #76]
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, r6
 ; CHECK-NEXT:    mov.w r8, #4
-; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r6
-; CHECK-NEXT:    vmov.32 q3[0], r5
-; CHECK-NEXT:    vmov.32 q5[1], r5
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    str r1, [sp, #24]
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov.32 q3[0], r4
+; CHECK-NEXT:    vmov.32 q7[1], r4
 ; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    vmov.f32 s11, s3
 ; CHECK-NEXT:    movs r1, #64
 ; CHECK-NEXT:    strh.w r8, [sp, #390]
+; CHECK-NEXT:    strd r0, r10, [sp, #24]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #44]
-; CHECK-NEXT:    str.w r9, [sp, #28]
-; CHECK-NEXT:    vstrw.32 q2, [r0]
 ; CHECK-NEXT:    str r0, [r0]
-; CHECK-NEXT:    vstrw.32 q5, [r0]
+; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vstrw.32 q7, [r0]
 ; CHECK-NEXT:    vstrw.32 q3, [r0]
 ; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    bl __aeabi_memclr4
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r7
-; CHECK-NEXT:    vmov q1[2], q1[0], r6, r6
-; CHECK-NEXT:    vmov q0[3], q0[1], r7, r5
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r7
-; CHECK-NEXT:    vmov.32 q4[0], r9
-; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    str.w r9, [r10]
+; CHECK-NEXT:    vmov.32 q5[1], r5
+; CHECK-NEXT:    vmov.32 q4[1], r4
+; CHECK-NEXT:    vmov.32 q5[2], r7
+; CHECK-NEXT:    vmov.32 q4[2], r7
+; CHECK-NEXT:    vmov.32 q5[3], r6
+; CHECK-NEXT:    vmov.32 q6[0], r10
+; CHECK-NEXT:    vmov.32 q4[3], r5
+; CHECK-NEXT:    str.w r10, [r9]
 ; CHECK-NEXT:    vstrw.32 q4, [r0]
-; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q6, [r0]
+; CHECK-NEXT:    vstrw.32 q5, [r0]
 ; CHECK-NEXT:    str.w r8, [sp, #308]
 ; CHECK-NEXT:  .LBB1_1: @ %for.cond
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
index 49cc4caadab6..9848a56b9f33 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll
@@ -577,20 +577,22 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    vorr q2, q0, q1
 ; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vbic q1, q1, q2
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q1, q1, q3
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vorr q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -607,40 +609,44 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6
 ; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vmov r1, s7
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    eors r2, r3
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vand q2, q3, q2
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vand q2, q2, q3
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -661,33 +667,37 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqr_v2i1(<2 x i64> %a, <2 x i64> %b, i64 %c
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s7
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    eors r0, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r2
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
index ebe39e87ef48..c7e553fa3510 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
@@ -13,12 +13,14 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
 ; CHECK-LE-NEXT:    vmsr p0, r0
 ; CHECK-LE-NEXT:    vpsel q1, q2, q1
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-LE-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-LE-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-LE-NEXT:    vmov.32 q2[0], r0
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-LE-NEXT:    vmov.u8 r1, q1[3]
+; CHECK-LE-NEXT:    vmov.32 q2[1], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-LE-NEXT:    vmov.32 q2[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-LE-NEXT:    vmov.32 q2[3], r0
 ; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-LE-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    add sp, #4
@@ -34,14 +36,16 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q2, q1
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-BE-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-BE-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-BE-NEXT:    vmov.32 q2[0], r0
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-BE-NEXT:    vmov.u8 r1, q1[3]
+; CHECK-BE-NEXT:    vmov.32 q2[1], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-BE-NEXT:    vmov.32 q2[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-BE-NEXT:    vmov.32 q2[3], r0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
 ; CHECK-BE-NEXT:    add sp, #4
@@ -175,11 +179,13 @@ define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) {
 ; CHECK-LE-NEXT:    sub sp, #4
 ; CHECK-LE-NEXT:    and r1, r0, #2
 ; CHECK-LE-NEXT:    and r0, r0, #1
-; CHECK-LE-NEXT:    movs r2, #0
 ; CHECK-LE-NEXT:    rsbs r0, r0, #0
+; CHECK-LE-NEXT:    movs r2, #0
+; CHECK-LE-NEXT:    vmov.32 q1[0], r0
 ; CHECK-LE-NEXT:    sub.w r1, r2, r1, lsr #1
-; CHECK-LE-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-LE-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-LE-NEXT:    vmov.32 q1[1], r0
+; CHECK-LE-NEXT:    vmov.32 q1[2], r1
+; CHECK-LE-NEXT:    vmov.32 q1[3], r1
 ; CHECK-LE-NEXT:    vand q0, q0, q1
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    bx lr
@@ -189,12 +195,14 @@ define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) {
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    and r1, r0, #2
-; CHECK-BE-NEXT:    and r0, r0, #1
 ; CHECK-BE-NEXT:    movs r2, #0
-; CHECK-BE-NEXT:    rsbs r0, r0, #0
+; CHECK-BE-NEXT:    and r0, r0, #1
 ; CHECK-BE-NEXT:    sub.w r1, r2, r1, lsr #1
-; CHECK-BE-NEXT:    vmov q1[2], q1[0], r0, r1
-; CHECK-BE-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-BE-NEXT:    rsbs r0, r0, #0
+; CHECK-BE-NEXT:    vmov.32 q1[0], r1
+; CHECK-BE-NEXT:    vmov.32 q1[1], r1
+; CHECK-BE-NEXT:    vmov.32 q1[2], r0
+; CHECK-BE-NEXT:    vmov.32 q1[3], r0
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vand q0, q0, q2
 ; CHECK-BE-NEXT:    add sp, #4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index a6e0068eff24..b88576a22cc2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -46,12 +46,12 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-LABEL: sext_v2i1_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    sbcs.w r0, r2, r0
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
@@ -64,8 +64,10 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-NEXT:    movlt r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp sgt <2 x i64> %src, zeroinitializer
@@ -119,14 +121,14 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-LABEL: zext_v2i1_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    adr r1, .LCPI7_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    sbcs.w r1, r0, r1
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
@@ -139,7 +141,8 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[2], r1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
@@ -198,14 +201,16 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @trunc_v2i1_v2i64(<2 x i64> %src) {
 ; CHECK-LABEL: trunc_v2i1_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    and r1, r1, #1
-; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
index 1250b685e790..df6a38f2f981 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
@@ -11,12 +11,14 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) {
 ; CHECK-LE-NEXT:    vmsr p0, r0
 ; CHECK-LE-NEXT:    vpsel q1, q2, q1
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-LE-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-LE-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-LE-NEXT:    vmov.32 q2[0], r0
 ; CHECK-LE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-LE-NEXT:    vmov.u8 r1, q1[3]
+; CHECK-LE-NEXT:    vmov.32 q2[1], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-LE-NEXT:    vmov.32 q2[2], r0
+; CHECK-LE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-LE-NEXT:    vmov.32 q2[3], r0
 ; CHECK-LE-NEXT:    vmov.i32 q1, #0x0
-; CHECK-LE-NEXT:    vmov q2[3], q2[1], r1, r0
 ; CHECK-LE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-LE-NEXT:    vpsel q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
@@ -29,14 +31,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q2, q1
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-BE-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-BE-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-BE-NEXT:    vmov.32 q2[0], r0
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-BE-NEXT:    vmov.u8 r1, q1[3]
+; CHECK-BE-NEXT:    vmov.32 q2[1], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-BE-NEXT:    vmov.32 q2[2], r0
+; CHECK-BE-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-BE-NEXT:    vmov.32 q2[3], r0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.32 q0, q1
 ; CHECK-BE-NEXT:    bx lr
@@ -141,11 +145,13 @@ define arm_aapcs_vfpcc <2 x i64> @load_v2i1(<2 x i1> *%src, <2 x i64> %a) {
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    ldrb r0, [r0]
 ; CHECK-LE-NEXT:    and r1, r0, #1
-; CHECK-LE-NEXT:    ubfx r0, r0, #1, #1
 ; CHECK-LE-NEXT:    rsbs r1, r1, #0
+; CHECK-LE-NEXT:    ubfx r0, r0, #1, #1
+; CHECK-LE-NEXT:    vmov.32 q1[0], r1
 ; CHECK-LE-NEXT:    rsbs r0, r0, #0
-; CHECK-LE-NEXT:    vmov q1[2], q1[0], r0, r1
-; CHECK-LE-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-LE-NEXT:    vmov.32 q1[1], r1
+; CHECK-LE-NEXT:    vmov.32 q1[2], r0
+; CHECK-LE-NEXT:    vmov.32 q1[3], r0
 ; CHECK-LE-NEXT:    vand q0, q0, q1
 ; CHECK-LE-NEXT:    bx lr
 ;
@@ -156,8 +162,10 @@ define arm_aapcs_vfpcc <2 x i64> @load_v2i1(<2 x i1> *%src, <2 x i64> %a) {
 ; CHECK-BE-NEXT:    and r0, r0, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
 ; CHECK-BE-NEXT:    rsbs r0, r0, #0
-; CHECK-BE-NEXT:    vmov q1[2], q1[0], r0, r1
-; CHECK-BE-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-BE-NEXT:    vmov.32 q1[0], r1
+; CHECK-BE-NEXT:    vmov.32 q1[1], r1
+; CHECK-BE-NEXT:    vmov.32 q1[2], r0
+; CHECK-BE-NEXT:    vmov.32 q1[3], r0
 ; CHECK-BE-NEXT:    vrev64.32 q2, q1
 ; CHECK-BE-NEXT:    vand q0, q0, q2
 ; CHECK-BE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
index 3bbda5cfb5cb..35fb1bfd6977 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll
@@ -325,18 +325,20 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vbic q0, q0, q2
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vorr q0, q1, q0
@@ -353,18 +355,20 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vbic q0, q0, q2
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vorr q0, q1, q0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
index 7883bb51bdab..fbc268fa9300 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll
@@ -379,32 +379,36 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vorr q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
@@ -424,40 +428,44 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6
 ; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vmov r1, s7
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    eors r2, r3
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vorr q2, q3, q2
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vorr q2, q2, q3
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
index 805d93832388..f6d3bafc1f01 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
@@ -316,11 +316,13 @@ define <4 x i32> @shuffle5_b_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    mov r0, sp
@@ -347,11 +349,13 @@ define <4 x i32> @shuffle5_t_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    mov r0, sp

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
index 0173142ef4c7..f92a4bd958f9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
@@ -459,32 +459,36 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    veor q2, q3, q2
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
@@ -504,40 +508,44 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6
 ; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vmov r1, s7
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    eors r2, r3
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    veor q2, q3, q2
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    veor q2, q2, q3
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1

diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index cd447218ae78..4579c2714b97 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -20,7 +20,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r10, r1
+; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    mov r11, r2
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_3: @ %vector.ph
@@ -32,65 +32,69 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
 ; CHECK-NEXT:    add.w r11, r2, r3, lsl #2
-; CHECK-NEXT:    add.w r10, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r6, r1, r3, lsl #2
 ; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    mvn r10, #-2147483648
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrd r4, r5, [r0]
-; CHECK-NEXT:    mov.w r3, #-1
-; CHECK-NEXT:    ldrd r7, r8, [r1]
+; CHECK-NEXT:    ldrd r4, r8, [r0]
 ; CHECK-NEXT:    adds r0, #8
-; CHECK-NEXT:    smull r6, r5, r8, r5
+; CHECK-NEXT:    ldrd r7, r5, [r1]
 ; CHECK-NEXT:    adds r1, #8
+; CHECK-NEXT:    smull r8, r5, r5, r8
 ; CHECK-NEXT:    smull r4, r7, r7, r4
-; CHECK-NEXT:    asrl r6, r5, #31
+; CHECK-NEXT:    asrl r8, r5, #31
 ; CHECK-NEXT:    asrl r4, r7, #31
 ; CHECK-NEXT:    rsbs.w r9, r4, #-2147483648
-; CHECK-NEXT:    vmov q4[2], q4[0], r6, r4
-; CHECK-NEXT:    sbcs r3, r7
-; CHECK-NEXT:    vmov q4[3], q4[1], r5, r7
+; CHECK-NEXT:    vmov.32 q4[0], r4
+; CHECK-NEXT:    mov.w r9, #-1
+; CHECK-NEXT:    sbcs.w r3, r9, r7
 ; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    vmov.32 q4[1], r7
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r9, ne
-; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
-; CHECK-NEXT:    mov.w r3, #-1
-; CHECK-NEXT:    mvn r6, #-2147483648
-; CHECK-NEXT:    sbcs r3, r5
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov.32 q4[2], r8
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov.32 q4[3], r5
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    rsbs.w r3, r8, #-2147483648
+; CHECK-NEXT:    sbcs.w r3, r9, r5
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r9
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r9
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    vbic q3, q0, q2
 ; CHECK-NEXT:    vand q2, q4, q2
 ; CHECK-NEXT:    vorr q2, q2, q3
 ; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    subs r4, r4, r6
-; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    subs.w r4, r4, r10
 ; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs r5, r5, r6
+; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    vmov.32 q3[1], r3
-; CHECK-NEXT:    sbcs r4, r4, #0
-; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    subs.w r4, r4, r10
+; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vbic q4, q1, q3
 ; CHECK-NEXT:    vand q2, q2, q3
 ; CHECK-NEXT:    vorr q2, q2, q4
@@ -112,7 +116,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r3, [r12], #4
-; CHECK-NEXT:    ldr r4, [r10], #4
+; CHECK-NEXT:    ldr r4, [r6], #4
 ; CHECK-NEXT:    smull r4, r3, r4, r3
 ; CHECK-NEXT:    asrl r4, r3, #31
 ; CHECK-NEXT:    subs r5, r1, r4
@@ -225,141 +229,149 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB1_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    mov r9, r1
 ; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB1_3
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r9, r1
+; CHECK-NEXT:    mov r1, r9
 ; CHECK-NEXT:    mov r11, r2
 ; CHECK-NEXT:    b .LBB1_6
 ; CHECK-NEXT:  .LBB1_3: @ %vector.ph
-; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    bic r3, r3, #3
-; CHECK-NEXT:    subs r7, r3, #4
+; CHECK-NEXT:    bic r7, r3, #3
 ; CHECK-NEXT:    adr r4, .LCPI1_0
-; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    subs r1, r7, #4
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
+; CHECK-NEXT:    str r7, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adr r4, .LCPI1_1
-; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT:    add.w r11, r2, r3, lsl #2
-; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
-; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
+; CHECK-NEXT:    add.w r11, r2, r7, lsl #2
+; CHECK-NEXT:    add.w r1, r9, r7, lsl #2
+; CHECK-NEXT:    add.w r12, r0, r7, lsl #2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    mov.w r10, #-1
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
-; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    mov.w r2, #-1
+; CHECK-NEXT:    vldrw.u32 q3, [r9], #16
 ; CHECK-NEXT:    vmov.f32 s16, s10
-; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    vmov.f32 s20, s14
 ; CHECK-NEXT:    vmov.f32 s18, s11
 ; CHECK-NEXT:    vmov.f32 s22, s15
 ; CHECK-NEXT:    vmullb.s32 q6, q5, q4
-; CHECK-NEXT:    vmov.f32 s14, s13
+; CHECK-NEXT:    vmov.f32 s10, s9
 ; CHECK-NEXT:    vmov r7, s25
 ; CHECK-NEXT:    vmov r4, s24
 ; CHECK-NEXT:    asrl r4, r7, #31
-; CHECK-NEXT:    vmov r10, s26
+; CHECK-NEXT:    vmov r8, s26
 ; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
-; CHECK-NEXT:    vmov.f32 s10, s9
-; CHECK-NEXT:    sbcs.w r5, r2, r7
+; CHECK-NEXT:    vmov.f32 s14, s13
+; CHECK-NEXT:    sbcs.w r5, r10, r7
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r5, #1
 ; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    vmov.32 q4[0], r5
+; CHECK-NEXT:    vmov.32 q4[1], r5
 ; CHECK-NEXT:    vmov r5, s27
-; CHECK-NEXT:    csetm r8, ne
-; CHECK-NEXT:    asrl r10, r5, #31
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    rsbs.w r3, r10, #-2147483648
-; CHECK-NEXT:    vmov q6[2], q6[0], r10, r4
-; CHECK-NEXT:    sbcs.w r3, r2, r5
-; CHECK-NEXT:    vmov q6[3], q6[1], r5, r7
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    asrl r8, r5, #31
+; CHECK-NEXT:    vmov.32 q6[0], r4
+; CHECK-NEXT:    rsbs.w r6, r8, #-2147483648
+; CHECK-NEXT:    vmov.32 q6[1], r7
+; CHECK-NEXT:    sbcs.w r6, r10, r5
+; CHECK-NEXT:    vmov.32 q6[2], r8
+; CHECK-NEXT:    mov.w r6, #0
+; CHECK-NEXT:    vmov.32 q6[3], r5
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r8
-; CHECK-NEXT:    vmov q4[3], q4[1], r3, r8
+; CHECK-NEXT:    movlt r6, #1
+; CHECK-NEXT:    cmp r6, #0
+; CHECK-NEXT:    csetm r6, ne
 ; CHECK-NEXT:    mvn r8, #-2147483648
+; CHECK-NEXT:    vmov.32 q4[2], r6
+; CHECK-NEXT:    vmov.32 q4[3], r6
+; CHECK-NEXT:    vmov r6, s14
 ; CHECK-NEXT:    vbic q5, q0, q4
 ; CHECK-NEXT:    vand q4, q6, q4
 ; CHECK-NEXT:    vorr q4, q4, q5
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    subs.w r5, r5, r8
+; CHECK-NEXT:    sbcs r4, r4, #0
 ; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    mov.w r3, #0
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csetm r4, ne
+; CHECK-NEXT:    vmov.32 q5[0], r4
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov r4, s19
 ; CHECK-NEXT:    subs.w r5, r5, r8
-; CHECK-NEXT:    vmov.32 q5[1], r3
-; CHECK-NEXT:    vmov r5, s10
+; CHECK-NEXT:    vmov r5, s12
 ; CHECK-NEXT:    sbcs r4, r4, #0
 ; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r4, #1
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    smull r6, r5, r6, r5
+; CHECK-NEXT:    vmov.32 q5[2], r4
+; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vbic q6, q1, q5
 ; CHECK-NEXT:    vand q4, q4, q5
 ; CHECK-NEXT:    vorr q4, q4, q6
-; CHECK-NEXT:    asrl r6, r5, #31
-; CHECK-NEXT:    smull r4, r7, r4, r3
+; CHECK-NEXT:    smull r4, r7, r5, r4
 ; CHECK-NEXT:    asrl r4, r7, #31
-; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
-; CHECK-NEXT:    vmov q5[2], q5[0], r6, r4
-; CHECK-NEXT:    sbcs.w r3, r2, r7
-; CHECK-NEXT:    vmov q5[3], q5[1], r5, r7
+; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
+; CHECK-NEXT:    vmov.32 q3[0], r4
+; CHECK-NEXT:    sbcs.w r5, r10, r7
+; CHECK-NEXT:    vmov.32 q3[1], r7
+; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r5, #1
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    vmov.32 q5[1], r5
+; CHECK-NEXT:    vmov r5, s10
+; CHECK-NEXT:    smull r6, r5, r6, r5
+; CHECK-NEXT:    asrl r6, r5, #31
+; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
+; CHECK-NEXT:    vmov.32 q3[2], r6
+; CHECK-NEXT:    sbcs.w r3, r10, r5
+; CHECK-NEXT:    vmov.32 q3[3], r5
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    rsbs.w r1, r6, #-2147483648
-; CHECK-NEXT:    sbcs.w r1, r2, r5
-; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #1
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    ldrd r1, r2, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:    vbic q3, q0, q2
-; CHECK-NEXT:    vand q2, q5, q2
-; CHECK-NEXT:    vorr q2, q2, q3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    subs.w r3, r3, r8
-; CHECK-NEXT:    sbcs r3, r4, #0
+; CHECK-NEXT:    vmov.32 q5[2], r3
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vbic q2, q0, q5
+; CHECK-NEXT:    vand q3, q3, q5
+; CHECK-NEXT:    vorr q2, q3, q2
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    sbcs r3, r3, #0
 ; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov.32 q3[1], r5
 ; CHECK-NEXT:    subs.w r4, r4, r8
 ; CHECK-NEXT:    sbcs r3, r3, #0
 ; CHECK-NEXT:    mov.w r3, #0
@@ -367,7 +379,7 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r5
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vbic q5, q1, q3
 ; CHECK-NEXT:    vand q2, q2, q3
 ; CHECK-NEXT:    vorr q2, q2, q5
@@ -388,25 +400,25 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    mvn r2, #-2147483648
 ; CHECK-NEXT:  .LBB1_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r1, [r12], #4
-; CHECK-NEXT:    ldr r4, [r9], #4
-; CHECK-NEXT:    smull r4, r1, r4, r1
-; CHECK-NEXT:    asrl r4, r1, #31
-; CHECK-NEXT:    subs r5, r3, r4
-; CHECK-NEXT:    sbcs.w r5, r0, r1
-; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    ldr r4, [r12], #4
+; CHECK-NEXT:    ldr r5, [r1], #4
+; CHECK-NEXT:    smull r4, r5, r5, r4
+; CHECK-NEXT:    asrl r4, r5, #31
+; CHECK-NEXT:    subs r6, r3, r4
+; CHECK-NEXT:    sbcs.w r6, r0, r5
+; CHECK-NEXT:    mov.w r6, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #1
-; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    movlt r6, #1
+; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    csel r4, r4, r3, ne
-; CHECK-NEXT:    csel r1, r1, r0, ne
-; CHECK-NEXT:    subs r5, r4, r2
-; CHECK-NEXT:    sbcs r1, r1, #0
-; CHECK-NEXT:    csel r1, r4, r2, lt
-; CHECK-NEXT:    str r1, [r11], #4
+; CHECK-NEXT:    csel r5, r5, r0, ne
+; CHECK-NEXT:    subs r6, r4, r2
+; CHECK-NEXT:    sbcs r5, r5, #0
+; CHECK-NEXT:    csel r4, r4, r2, lt
+; CHECK-NEXT:    str r4, [r11], #4
 ; CHECK-NEXT:    le lr, .LBB1_7
 ; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -503,8 +515,10 @@ for.body:                                         ; preds = %for.body.preheader2
 define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: ssatmul_4t_q31:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
@@ -513,19 +527,19 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    beq.w .LBB2_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r7, r3, #3
-; CHECK-NEXT:    movs r6, #1
-; CHECK-NEXT:    bic r7, r7, #3
 ; CHECK-NEXT:    adr r4, .LCPI2_1
+; CHECK-NEXT:    bic r7, r7, #3
+; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    subs r7, #4
-; CHECK-NEXT:    adr r5, .LCPI2_2
 ; CHECK-NEXT:    vldrw.u32 q2, [r4]
-; CHECK-NEXT:    vldrw.u32 q3, [r5]
+; CHECK-NEXT:    adr r4, .LCPI2_2
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
 ; CHECK-NEXT:    adr r6, .LCPI2_0
 ; CHECK-NEXT:    subs r7, r3, #1
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    vldrw.u32 q3, [r4]
 ; CHECK-NEXT:    vdup.32 q1, r7
 ; CHECK-NEXT:    mov.w r12, #-1
 ; CHECK-NEXT:    mvn r8, #-2147483648
@@ -549,59 +563,63 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov r5, s1
 ; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    asrl r6, r5, #31
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
+; CHECK-NEXT:    vmov r7, s3
+; CHECK-NEXT:    rsbs.w r4, r6, #-2147483648
+; CHECK-NEXT:    vmov.32 q7[0], r6
+; CHECK-NEXT:    sbcs.w r4, r12, r5
+; CHECK-NEXT:    vmov.32 q7[1], r5
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    vmov.f32 s22, s21
-; CHECK-NEXT:    sbcs.w r7, r12, r5
-; CHECK-NEXT:    mov.w r7, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #1
-; CHECK-NEXT:    cmp r7, #0
-; CHECK-NEXT:    vmov r7, s3
-; CHECK-NEXT:    csetm r10, ne
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csetm r4, ne
+; CHECK-NEXT:    vmov.32 q6[0], r4
+; CHECK-NEXT:    vmov.32 q6[1], r4
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    asrl r4, r7, #31
 ; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
-; CHECK-NEXT:    vmov q7[2], q7[0], r4, r6
+; CHECK-NEXT:    vmov.32 q7[2], r4
 ; CHECK-NEXT:    sbcs.w r3, r12, r7
-; CHECK-NEXT:    vmov q7[3], q7[1], r7, r5
+; CHECK-NEXT:    vmov.32 q7[3], r7
 ; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    vmov r7, s22
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r10
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r10
-; CHECK-NEXT:    vbic q6, q2, q0
-; CHECK-NEXT:    vand q0, q7, q0
-; CHECK-NEXT:    vorr q6, q0, q6
+; CHECK-NEXT:    vmov.32 q6[2], r3
+; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    vbic q0, q2, q6
+; CHECK-NEXT:    vand q6, q7, q6
+; CHECK-NEXT:    vorr q6, q6, q0
 ; CHECK-NEXT:    vmov r4, s24
 ; CHECK-NEXT:    vmov r3, s25
-; CHECK-NEXT:    vmov r5, s26
 ; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    vmov r4, s27
 ; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    vmov r4, s26
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs.w r5, r5, r8
+; CHECK-NEXT:    vmov.32 q0[0], r3
 ; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    sbcs r4, r4, #0
-; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    vmov r3, s27
+; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov.32 q0[2], r3
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r4, s20
 ; CHECK-NEXT:    vbic q7, q3, q0
 ; CHECK-NEXT:    vand q0, q6, q0
 ; CHECK-NEXT:    vorr q6, q0, q7
 ; CHECK-NEXT:    smull r6, r5, r4, r3
-; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    vmov r4, s22
 ; CHECK-NEXT:    asrl r6, r5, #31
 ; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
 ; CHECK-NEXT:    sbcs.w r3, r12, r5
@@ -609,43 +627,49 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r10, ne
-; CHECK-NEXT:    smull r4, r7, r7, r4
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov.32 q7[0], r3
+; CHECK-NEXT:    vmov.32 q7[1], r3
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov.32 q4[0], r6
+; CHECK-NEXT:    vmov.32 q4[1], r5
+; CHECK-NEXT:    smull r4, r7, r4, r3
 ; CHECK-NEXT:    asrl r4, r7, #31
 ; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r6
+; CHECK-NEXT:    vmov.32 q4[2], r4
 ; CHECK-NEXT:    sbcs.w r3, r12, r7
-; CHECK-NEXT:    vmov q5[3], q5[1], r7, r5
+; CHECK-NEXT:    vmov.32 q4[3], r7
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r10
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r10
-; CHECK-NEXT:    vbic q4, q2, q0
-; CHECK-NEXT:    vand q0, q5, q0
-; CHECK-NEXT:    vorr q4, q0, q4
+; CHECK-NEXT:    vmov.32 q7[2], r3
+; CHECK-NEXT:    vmov.32 q7[3], r3
+; CHECK-NEXT:    vbic q0, q2, q7
+; CHECK-NEXT:    vand q4, q4, q7
+; CHECK-NEXT:    vorr q4, q4, q0
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    subs.w r4, r4, r8
-; CHECK-NEXT:    vmov r4, s19
 ; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    vmov r4, s18
 ; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    subs.w r5, r5, r8
+; CHECK-NEXT:    vmov.32 q0[0], r3
 ; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    sbcs r4, r4, #0
-; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    subs.w r4, r4, r8
+; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    mov.w r3, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #1
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csetm r4, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    movlt r3, #1
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    csetm r3, ne
+; CHECK-NEXT:    vmov.32 q0[2], r3
 ; CHECK-NEXT:    vbic q5, q3, q0
 ; CHECK-NEXT:    vand q0, q4, q0
 ; CHECK-NEXT:    vorr q0, q0, q5
@@ -658,7 +682,8 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI2_0:
@@ -750,31 +775,33 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrd r4, r7, [r0]
+; CHECK-NEXT:    ldrd r4, r9, [r0]
 ; CHECK-NEXT:    adds r0, #8
 ; CHECK-NEXT:    ldrd r5, r10, [r1]
 ; CHECK-NEXT:    adds r1, #8
 ; CHECK-NEXT:    umull r4, r5, r5, r4
 ; CHECK-NEXT:    lsrl r4, r5, #31
 ; CHECK-NEXT:    subs.w r6, r4, #-1
-; CHECK-NEXT:    umull r6, r7, r10, r7
+; CHECK-NEXT:    vmov.32 q1[0], r4
 ; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    lsrl r6, r7, #31
-; CHECK-NEXT:    csetm r9, ne
-; CHECK-NEXT:    subs.w r5, r6, #-1
-; CHECK-NEXT:    vmov.32 q0[1], r9
-; CHECK-NEXT:    sbcs r5, r7, #0
-; CHECK-NEXT:    vmov q1[2], q1[0], r6, r4
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    vmov.32 q0[0], r5
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    umull r6, r5, r10, r9
+; CHECK-NEXT:    lsrl r6, r5, #31
+; CHECK-NEXT:    subs.w r7, r6, #-1
+; CHECK-NEXT:    vmov.32 q1[2], r6
+; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r9
+; CHECK-NEXT:    vmov.32 q0[2], r5
 ; CHECK-NEXT:    vand q1, q1, q0
 ; CHECK-NEXT:    vorn q0, q1, q0
 ; CHECK-NEXT:    vmov r4, s2
@@ -879,10 +906,8 @@ for.body:                                         ; preds = %for.body.preheader,
 define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: usatmul_4_q31:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    cmp r3, #0
@@ -918,53 +943,57 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vmov r5, s17
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    lsrl r4, r5, #31
-; CHECK-NEXT:    vmov r7, s19
-; CHECK-NEXT:    subs.w r6, r4, #-1
 ; CHECK-NEXT:    vmov.f32 s10, s9
+; CHECK-NEXT:    subs.w r6, r4, #-1
+; CHECK-NEXT:    vmov.32 q3[0], r4
 ; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    vmov r6, s18
 ; CHECK-NEXT:    mov.w r5, #0
-; CHECK-NEXT:    lsrl r6, r7, #31
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r11, ne
-; CHECK-NEXT:    subs.w r5, r6, #-1
-; CHECK-NEXT:    sbcs r5, r7, #0
-; CHECK-NEXT:    vmov.32 q1[1], r11
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    vmov.32 q1[0], r5
+; CHECK-NEXT:    vmov.32 q1[1], r5
+; CHECK-NEXT:    vmov r5, s19
+; CHECK-NEXT:    lsrl r6, r5, #31
+; CHECK-NEXT:    subs.w r7, r6, #-1
+; CHECK-NEXT:    vmov.32 q3[2], r6
+; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    mov.w r5, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r4
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r11
+; CHECK-NEXT:    vmov.32 q1[2], r5
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vorn q1, q3, q1
 ; CHECK-NEXT:    vmullb.u32 q3, q2, q0
 ; CHECK-NEXT:    vmov r5, s13
 ; CHECK-NEXT:    vmov r4, s12
 ; CHECK-NEXT:    lsrl r4, r5, #31
-; CHECK-NEXT:    vmov r7, s15
 ; CHECK-NEXT:    subs.w r6, r4, #-1
+; CHECK-NEXT:    vmov.32 q2[0], r4
 ; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    vmov r6, s14
 ; CHECK-NEXT:    mov.w r5, #0
-; CHECK-NEXT:    lsrl r6, r7, #31
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r11, ne
-; CHECK-NEXT:    subs.w r5, r6, #-1
-; CHECK-NEXT:    sbcs r5, r7, #0
-; CHECK-NEXT:    vmov.32 q0[1], r11
+; CHECK-NEXT:    csetm r5, ne
+; CHECK-NEXT:    vmov.32 q0[0], r5
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    lsrl r6, r5, #31
+; CHECK-NEXT:    subs.w r7, r6, #-1
+; CHECK-NEXT:    vmov.32 q2[2], r6
+; CHECK-NEXT:    sbcs r5, r5, #0
 ; CHECK-NEXT:    mov.w r5, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r6, r4
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r5, #1
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r11
+; CHECK-NEXT:    vmov.32 q0[2], r5
 ; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:    vorn q0, q2, q0
 ; CHECK-NEXT:    vmov.f32 s1, s2
@@ -992,8 +1021,7 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    le lr, .LBB4_7
 ; CHECK-NEXT:  .LBB4_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
@@ -1563,12 +1591,12 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: ssatmul_8t_q15:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB9_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
@@ -1579,99 +1607,107 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    sub.w r12, r12, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI9_1
-; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    add.w lr, lr, r12, lsr #3
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    vldrw.u32 q4, [r4]
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
-; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vdup.32 q6, r5
-; CHECK-NEXT:    adds r5, #8
-; CHECK-NEXT:    vorr q5, q6, q0
-; CHECK-NEXT:    vorr q6, q6, q4
+; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vdup.32 q0, r3
+; CHECK-NEXT:    adds r3, #8
+; CHECK-NEXT:    vorr q5, q0, q5
+; CHECK-NEXT:    vorr q0, q0, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q5
-; CHECK-NEXT:    vpsel q7, q3, q2
-; CHECK-NEXT:    vcmp.u32 cs, q1, q6
-; CHECK-NEXT:    vmov r4, s28
 ; CHECK-NEXT:    vpsel q6, q3, q2
+; CHECK-NEXT:    vcmp.u32 cs, q1, q0
+; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vpsel q0, q3, q2
 ; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov r4, s29
+; CHECK-NEXT:    vmov r4, s25
 ; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s30
+; CHECK-NEXT:    vmov r4, s26
 ; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s31
+; CHECK-NEXT:    vmov r4, s27
 ; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s25
+; CHECK-NEXT:    vmov r4, s1
 ; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov r4, s27
+; CHECK-NEXT:    vmov r4, s3
 ; CHECK-NEXT:    vmov.16 q5[7], r4
 ; CHECK-NEXT:    vpt.i16 ne, q5, zr
 ; CHECK-NEXT:    vldrht.u16 q6, [r0], #16
 ; CHECK-NEXT:    vmov.u16 r4, q6[0]
-; CHECK-NEXT:    vmov.u16 r3, q6[2]
-; CHECK-NEXT:    vmov q5[2], q5[0], r3, r4
-; CHECK-NEXT:    vmov.u16 r3, q6[1]
-; CHECK-NEXT:    vmov.u16 r4, q6[3]
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrht.u16 q7, [r1], #16
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[0]
+; CHECK-NEXT:    vmov.32 q5[0], r4
+; CHECK-NEXT:    vmov.u16 r4, q6[1]
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov.u16 r4, q6[2]
+; CHECK-NEXT:    vmov.32 q5[2], r4
+; CHECK-NEXT:    vmov.u16 r4, q6[3]
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov.u16 r4, q7[0]
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov.u16 r4, q7[1]
+; CHECK-NEXT:    vmov.32 q0[1], r4
 ; CHECK-NEXT:    vmov.u16 r4, q7[2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[1]
+; CHECK-NEXT:    vmov.32 q0[2], r4
 ; CHECK-NEXT:    vmov.u16 r4, q7[3]
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r4, q6[6]
+; CHECK-NEXT:    vmov.32 q0[3], r4
 ; CHECK-NEXT:    vmullb.s16 q0, q0, q5
 ; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.16 q5[0], r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov.16 q5[1], r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.16 q5[2], r3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.16 q5[3], r3
-; CHECK-NEXT:    vmov.u16 r3, q6[4]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q6[5]
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.16 q5[0], r4
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov.16 q5[1], r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov.16 q5[2], r4
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov.16 q5[3], r4
+; CHECK-NEXT:    vmov.u16 r4, q6[4]
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov.u16 r4, q6[5]
+; CHECK-NEXT:    vmov.32 q0[1], r4
+; CHECK-NEXT:    vmov.u16 r4, q6[6]
+; CHECK-NEXT:    vmov.32 q0[2], r4
 ; CHECK-NEXT:    vmov.u16 r4, q6[7]
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[4]
+; CHECK-NEXT:    vmov.32 q0[3], r4
+; CHECK-NEXT:    vmov.u16 r4, q7[4]
+; CHECK-NEXT:    vmov.32 q6[0], r4
+; CHECK-NEXT:    vmov.u16 r4, q7[5]
+; CHECK-NEXT:    vmov.32 q6[1], r4
 ; CHECK-NEXT:    vmov.u16 r4, q7[6]
-; CHECK-NEXT:    vmov q6[2], q6[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[5]
+; CHECK-NEXT:    vmov.32 q6[2], r4
 ; CHECK-NEXT:    vmov.u16 r4, q7[7]
-; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
+; CHECK-NEXT:    vmov.32 q6[3], r4
 ; CHECK-NEXT:    vmullb.s16 q0, q6, q0
 ; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.16 q5[4], r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov.16 q5[5], r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.16 q5[6], r3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.16 q5[7], r3
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.16 q5[4], r4
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov.16 q5[5], r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov.16 q5[6], r4
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov.16 q5[7], r4
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrht.16 q5, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB9_2
 ; CHECK-NEXT:  .LBB9_3: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI9_0:

diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index 0021ff342330..7313cb66c9c9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -34,13 +34,12 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: sadd_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov lr, s4
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r1, gt
 ; CHECK-NEXT:    cmp.w r2, #-1
@@ -49,49 +48,53 @@ define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    cset r12, eq
 ; CHECK-NEXT:    adds.w r1, r1, lr
+; CHECK-NEXT:    adcs r2, r0
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r0, gt
+; CHECK-NEXT:    cmp r3, r0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    and.w r0, r0, r12
+; CHECK-NEXT:    mvn r12, #-2147483648
+; CHECK-NEXT:    and r3, r0, #1
+; CHECK-NEXT:    cset r0, mi
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    cinv r0, r12, eq
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    asrne r1, r2, #31
+; CHECK-NEXT:    csel r0, r0, r2, ne
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    cmp.w r0, #-1
+; CHECK-NEXT:    cset r1, gt
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r3, gt
+; CHECK-NEXT:    cmp r3, r1
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    cset lr, eq
+; CHECK-NEXT:    adds r1, r1, r4
 ; CHECK-NEXT:    adcs r0, r2
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r2, gt
 ; CHECK-NEXT:    cmp r3, r2
-; CHECK-NEXT:    vmov r3, s7
 ; CHECK-NEXT:    cset r2, ne
-; CHECK-NEXT:    and.w r2, r2, r12
-; CHECK-NEXT:    ands r12, r2, #1
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    and.w r2, r2, lr
+; CHECK-NEXT:    ands r2, r2, #1
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    asrne r1, r0, #31
-; CHECK-NEXT:    cmp.w r3, #-1
-; CHECK-NEXT:    cset lr, gt
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r4, gt
-; CHECK-NEXT:    cmp r4, lr
-; CHECK-NEXT:    cset lr, eq
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    mvn r6, #-2147483648
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r3, gt
-; CHECK-NEXT:    cmp r4, r3
-; CHECK-NEXT:    cset r3, ne
-; CHECK-NEXT:    and.w r3, r3, lr
-; CHECK-NEXT:    ands r3, r3, #1
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r5, r2, #31
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r1
+; CHECK-NEXT:    vmov.32 q2[2], r1
 ; CHECK-NEXT:    cset r1, mi
 ; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    cinv r1, r6, eq
-; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r0, r1, r0, ne
+; CHECK-NEXT:    cinv r1, r12, eq
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r1, mi
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    cinv r1, r6, eq
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csel r1, r1, r2, ne
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    csel r0, r1, r0, ne
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %0 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
@@ -130,36 +133,34 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: uadd_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r5, s2
 ; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    adcs r1, r12, #0
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    movne.w r0, #-1
+; CHECK-NEXT:    movne.w r2, #-1
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    adcs lr, r12, #0
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adcs r0, r1
+; CHECK-NEXT:    adcs r1, r12, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    adds r4, r4, r5
-; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    adcs r3, r12, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r4, #-1
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r0, #-1
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r1, #-1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
 entry:
   %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
@@ -199,13 +200,12 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: ssub_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov lr, s4
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r1, gt
 ; CHECK-NEXT:    cmp.w r2, #-1
@@ -214,49 +214,53 @@ define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    cset r12, ne
 ; CHECK-NEXT:    subs.w r1, r1, lr
+; CHECK-NEXT:    sbcs r2, r0
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r0, gt
+; CHECK-NEXT:    cmp r3, r0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    and.w r0, r0, r12
+; CHECK-NEXT:    mvn r12, #-2147483648
+; CHECK-NEXT:    and r3, r0, #1
+; CHECK-NEXT:    cset r0, mi
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    cinv r0, r12, eq
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    asrne r1, r2, #31
+; CHECK-NEXT:    csel r0, r0, r2, ne
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    cmp.w r0, #-1
+; CHECK-NEXT:    cset r1, gt
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    cset r3, gt
+; CHECK-NEXT:    cmp r3, r1
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    cset lr, ne
+; CHECK-NEXT:    subs r1, r4, r1
 ; CHECK-NEXT:    sbc.w r0, r2, r0
 ; CHECK-NEXT:    cmp.w r0, #-1
 ; CHECK-NEXT:    cset r2, gt
 ; CHECK-NEXT:    cmp r3, r2
-; CHECK-NEXT:    vmov r3, s7
 ; CHECK-NEXT:    cset r2, ne
-; CHECK-NEXT:    and.w r2, r2, r12
-; CHECK-NEXT:    ands r12, r2, #1
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    and.w r2, r2, lr
+; CHECK-NEXT:    ands r2, r2, #1
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    asrne r1, r0, #31
-; CHECK-NEXT:    cmp.w r3, #-1
-; CHECK-NEXT:    cset lr, gt
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r4, gt
-; CHECK-NEXT:    cmp r4, lr
-; CHECK-NEXT:    cset lr, ne
-; CHECK-NEXT:    subs r5, r6, r5
-; CHECK-NEXT:    sbcs r2, r3
-; CHECK-NEXT:    mvn r6, #-2147483648
-; CHECK-NEXT:    cmp.w r2, #-1
-; CHECK-NEXT:    cset r3, gt
-; CHECK-NEXT:    cmp r4, r3
-; CHECK-NEXT:    cset r3, ne
-; CHECK-NEXT:    and.w r3, r3, lr
-; CHECK-NEXT:    ands r3, r3, #1
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    asrne r5, r2, #31
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r1
+; CHECK-NEXT:    vmov.32 q2[2], r1
 ; CHECK-NEXT:    cset r1, mi
 ; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    cinv r1, r6, eq
-; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    csel r0, r1, r0, ne
+; CHECK-NEXT:    cinv r1, r12, eq
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r1, mi
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    cinv r1, r6, eq
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csel r1, r1, r2, ne
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    csel r0, r1, r0, ne
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %0 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
@@ -295,38 +299,36 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: usub_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r5, s2
 ; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sbcs.w r0, r1, r0
 ; CHECK-NEXT:    adc r1, r12, #0
-; CHECK-NEXT:    rsbs.w lr, r1, #1
+; CHECK-NEXT:    rsbs.w r1, r1, #1
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    movne r0, #0
+; CHECK-NEXT:    movne r2, #0
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    sbcs.w r0, r1, r0
+; CHECK-NEXT:    adc r1, r12, #0
+; CHECK-NEXT:    rsbs.w r1, r1, #1
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r2, #0
-; CHECK-NEXT:    subs r4, r5, r4
-; CHECK-NEXT:    sbcs r1, r3
-; CHECK-NEXT:    adc r3, r12, #0
-; CHECK-NEXT:    rsbs.w r3, r3, #1
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r4, #0
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne r0, #0
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne r1, #0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
 entry:
   %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
index 7d6ce1f1b49b..4514de457859 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -64,8 +64,9 @@ define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x
 ; CHECK-NEXT:    ldrb r2, [r1]
 ; CHECK-NEXT:    vmov.i32 q1, #0xff
 ; CHECK-NEXT:    ldrb r1, [r1, #1]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.32 q2[2], r1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    strb r2, [r0, r1]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll
index e0e6fefef98b..26f524d7aed8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll
@@ -63,11 +63,14 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i32(<2 x i64> %m) {
 ; CHECK-LABEL: sext_v2i64_v2i64_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %shl = shl <2 x i64> %m, <i64 32, i64 32>
@@ -79,13 +82,15 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i35(<2 x i64> %m) {
 ; CHECK-LABEL: sext_v2i64_v2i64_v2i35:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    sbfx r0, r0, #0, #3
-; CHECK-NEXT:    sbfx r1, r1, #0, #3
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    sbfx r0, r0, #0, #3
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -173,19 +178,23 @@ define arm_aapcs_vfpcc <8 x i32> @sext_v8i16_v8i32(<8 x i16> %src) {
 ; CHECK-LABEL: sext_v8i16_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmovlb.s16 q2, q1
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -199,37 +208,45 @@ define arm_aapcs_vfpcc <16 x i32> @sext_v16i8_v16i32(<16 x i8> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[4]
 ; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    vmovlb.s16 q4, q1
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s8 q2, q2
 ; CHECK-NEXT:    vmovlb.s8 q0, q3
 ; CHECK-NEXT:    vmovlb.s16 q3, q0
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q2, q2
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
@@ -242,11 +259,14 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i32_v2i64(<2 x i32> %src) {
 ; CHECK-LABEL: sext_v2i32_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = sext <2 x i32> %src to <2 x i64>
@@ -333,19 +353,23 @@ define arm_aapcs_vfpcc <8 x i32> @zext_v8i16_v8i32(<8 x i16> %src) {
 ; CHECK-LABEL: zext_v8i16_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmovlb.u16 q2, q1
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -359,35 +383,43 @@ define arm_aapcs_vfpcc <16 x i32> @zext_v16i8_v16i32(<16 x i8> %src) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
 ; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    vand q4, q1, q3
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
+; CHECK-NEXT:    vmov.32 q5[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov q0, q4
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vand q2, q2, q3
 ; CHECK-NEXT:    vand q3, q5, q3
+; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:

diff --git a/llvm/test/CodeGen/Thumb2/mve-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-shifts.ll
index 1db3ebe0f4c5..8243e0df2059 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shifts.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shifts.ll
@@ -38,12 +38,15 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qq_int64_t(<2 x i64> %src1, <2 x i64> %src
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    lsll r2, r1, r0
-; CHECK-NEXT:    vmov r12, s6
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    lsll r0, r3, r12
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    lsll r2, r1, r0
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = shl <2 x i64> %src1, %src2
@@ -87,21 +90,22 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shru_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: shru_qq_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r5, lr}
-; CHECK-NEXT:    push {r5, lr}
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    lsll r0, r5, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    lsll r2, r3, r1
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r5
-; CHECK-NEXT:    pop {r5, pc}
+; CHECK-NEXT:    lsll r0, r1, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    lsll r0, r1, r2
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
 entry:
   %0 = lshr <2 x i64> %src1, %src2
   ret <2 x i64> %0
@@ -148,12 +152,15 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qq_int64_t(<2 x i64> %src1, <2 x i64> %sr
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    asrl r2, r1, r0
-; CHECK-NEXT:    vmov r12, s6
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    asrl r0, r3, r12
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    asrl r2, r1, r0
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = ashr <2 x i64> %src1, %src2
@@ -196,12 +203,15 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qi_int64_t(<2 x i64> %src1) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    lsll r0, r1, #4
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    lsll r2, r3, #4
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    lsll r0, r1, #4
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = shl <2 x i64> %src1, <i64 4, i64 4>
@@ -244,12 +254,15 @@ define arm_aapcs_vfpcc <2 x i64> @shru_qi_int64_t(<2 x i64> %src1) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    lsrl r0, r1, #4
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    lsrl r2, r3, #4
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    lsrl r0, r1, #4
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = lshr <2 x i64> %src1, <i64 4, i64 4>
@@ -292,12 +305,15 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qi_int64_t(<2 x i64> %src1) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    asrl r0, r1, #4
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    asrl r2, r3, #4
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    asrl r0, r1, #4
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = ashr <2 x i64> %src1, <i64 4, i64 4>
@@ -345,13 +361,16 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shl_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    lsll r12, r1, r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    lsll r2, r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    lsll r2, r3, r0
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    lsll r2, r1, r0
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -403,15 +422,18 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shru_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shru_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    rsb.w r12, r0, #0
+; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    lsll r2, r1, r12
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    lsll r0, r3, r12
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    lsll r2, r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    lsll r2, r1, r0
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -464,13 +486,16 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qr_int64_t(<2 x i64> %src1, i64 %src2) {
 ; CHECK-LABEL: shrs_qr_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    asrl r12, r1, r0
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    asrl r2, r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    asrl r2, r3, r0
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    asrl r2, r1, r0
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0

diff --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index 087fa5d4baf0..dc04c5e75837 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -37,20 +37,22 @@ define arm_aapcs_vfpcc <2 x i64> @add_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    adc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = add nsw <2 x i64> %src1, %src2
@@ -186,20 +188,22 @@ define arm_aapcs_vfpcc <2 x i64> @sub_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov r1, s7
 ; CHECK-NEXT:    subs.w lr, r3, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    sbc.w r12, r1, r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    subs r0, r2, r0
 ; CHECK-NEXT:    sbc.w r1, r3, r1
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = sub nsw <2 x i64> %src2, %src1
@@ -348,9 +352,11 @@ define arm_aapcs_vfpcc <2 x i64> @mul_int64_t(<2 x i64> %src1, <2 x i64> %src2)
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    mla r0, r2, r0, lr
 ; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r12
+; CHECK-NEXT:    vmov.32 q0[0], r12
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q0[2], r4
 ; CHECK-NEXT:    mla r1, r2, r3, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %0 = mul nsw <2 x i64> %src1, %src2

diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
index fa1be8ed7beb..37ca5a2f2020 100644
--- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
@@ -59,20 +59,22 @@ define <2 x i64> @vector_add_i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; CHECK-FP-NEXT:    vmov d0, r0, r1
 ; CHECK-FP-NEXT:    add r0, sp, #8
 ; CHECK-FP-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-FP-NEXT:    vmov r1, s0
-; CHECK-FP-NEXT:    vmov r3, s4
-; CHECK-FP-NEXT:    vmov r0, s1
-; CHECK-FP-NEXT:    vmov r2, s5
+; CHECK-FP-NEXT:    vmov r1, s2
+; CHECK-FP-NEXT:    vmov r0, s3
+; CHECK-FP-NEXT:    vmov r3, s6
+; CHECK-FP-NEXT:    vmov r2, s7
 ; CHECK-FP-NEXT:    adds.w lr, r1, r3
-; CHECK-FP-NEXT:    vmov r3, s2
-; CHECK-FP-NEXT:    vmov r1, s6
+; CHECK-FP-NEXT:    vmov r3, s0
+; CHECK-FP-NEXT:    vmov r1, s4
 ; CHECK-FP-NEXT:    adc.w r12, r0, r2
-; CHECK-FP-NEXT:    vmov r2, s3
-; CHECK-FP-NEXT:    vmov r0, s7
+; CHECK-FP-NEXT:    vmov r2, s1
+; CHECK-FP-NEXT:    vmov r0, s5
 ; CHECK-FP-NEXT:    adds r1, r1, r3
-; CHECK-FP-NEXT:    vmov q0[2], q0[0], r1, lr
+; CHECK-FP-NEXT:    vmov.32 q0[0], r1
 ; CHECK-FP-NEXT:    adcs r0, r2
-; CHECK-FP-NEXT:    vmov q0[3], q0[1], r0, r12
+; CHECK-FP-NEXT:    vmov.32 q0[1], r0
+; CHECK-FP-NEXT:    vmov.32 q0[2], lr
+; CHECK-FP-NEXT:    vmov.32 q0[3], r12
 ; CHECK-FP-NEXT:    vmov r0, r1, d0
 ; CHECK-FP-NEXT:    vmov r2, r3, d1
 ; CHECK-FP-NEXT:    pop {r7, pc}

diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 7cc51dfa115e..cb82f9020d34 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -124,19 +124,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vabd_s16:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmovlb.s16 q2, q2
 ; CHECK-NEXT:    vmovlb.s16 q3, q3
 ; CHECK-NEXT:    vsub.i32 q2, q3, q2
 ; CHECK-NEXT:    vabs.s32 q3, q2
@@ -149,17 +152,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmovlb.s16 q1, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmovlb.s16 q0, q3
 ; CHECK-NEXT:    vsub.i32 q0, q0, q1
 ; CHECK-NEXT:    vabs.s32 q0, q0
@@ -186,47 +194,47 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vabd_s32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.f32 s8, s0
 ; CHECK-NEXT:    vmov.f32 s12, s4
 ; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vmov.f32 s14, s5
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.f32 s16, s2
-; CHECK-NEXT:    vmov.f32 s20, s6
-; CHECK-NEXT:    vmov.f32 s18, s3
-; CHECK-NEXT:    vmov.f32 s22, s7
-; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    subs r0, r0, r2
 ; CHECK-NEXT:    sbc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
 ; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    asrs r2, r1, #31
 ; CHECK-NEXT:    subs r1, r1, r3
 ; CHECK-NEXT:    sbc.w r2, r2, r3, asr #31
-; CHECK-NEXT:    vmov r3, s22
 ; CHECK-NEXT:    add.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    eor.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    subs r0, r0, r2
 ; CHECK-NEXT:    sbc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
 ; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov r1, s18
-; CHECK-NEXT:    asrs r2, r1, #31
-; CHECK-NEXT:    subs r1, r1, r3
-; CHECK-NEXT:    sbc.w r2, r2, r3, asr #31
-; CHECK-NEXT:    add.w r1, r1, r2, asr #31
-; CHECK-NEXT:    eor.w r1, r1, r2, asr #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    asrs r1, r0, #31
+; CHECK-NEXT:    subs r0, r0, r2
+; CHECK-NEXT:    sbc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    add.w r0, r0, r1, asr #31
+; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
   %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -361,19 +369,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vabd_u16:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmovlb.u16 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmovlb.u16 q2, q2
 ; CHECK-NEXT:    vmovlb.u16 q3, q3
 ; CHECK-NEXT:    vsub.i32 q2, q3, q2
 ; CHECK-NEXT:    vabs.s32 q3, q2
@@ -386,17 +397,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmovlb.u16 q1, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmovlb.u16 q0, q3
 ; CHECK-NEXT:    vsub.i32 q0, q0, q1
 ; CHECK-NEXT:    vabs.s32 q0, q0
@@ -423,56 +439,59 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
 define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vabd_u32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s12, s0
+; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
+; CHECK-NEXT:    vmov.f32 s16, s0
 ; CHECK-NEXT:    vmov.f32 s10, s5
-; CHECK-NEXT:    vmov.f32 s14, s1
-; CHECK-NEXT:    vand q2, q2, q4
-; CHECK-NEXT:    vand q3, q3, q4
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s13
-; CHECK-NEXT:    vmov.f32 s20, s6
-; CHECK-NEXT:    vmov.f32 s22, s7
-; CHECK-NEXT:    vand q1, q5, q4
-; CHECK-NEXT:    vmov.f32 s20, s2
-; CHECK-NEXT:    vmov.f32 s22, s3
-; CHECK-NEXT:    vand q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s18, s1
+; CHECK-NEXT:    vand q2, q2, q3
+; CHECK-NEXT:    vand q4, q4, q3
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov r1, s19
 ; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    sbc.w r0, r1, r0
 ; CHECK-NEXT:    add.w r1, r2, r0, asr #31
 ; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    eor.w r12, r1, r0, asr #31
 ; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov.f32 s16, s6
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vand q1, q4, q3
+; CHECK-NEXT:    vmov.f32 s16, s2
+; CHECK-NEXT:    vmov.f32 s18, s3
+; CHECK-NEXT:    vand q0, q4, q3
 ; CHECK-NEXT:    subs r0, r0, r3
-; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sbc.w r1, r2, r1
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    add.w r0, r0, r1, asr #31
 ; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r12
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.32 q2[1], r12
 ; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sbc.w r0, r1, r0
 ; CHECK-NEXT:    add.w r1, r2, r0, asr #31
-; CHECK-NEXT:    vmov r2, s19
-; CHECK-NEXT:    eor.w r12, r1, r0, asr #31
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    subs r0, r0, r3
-; CHECK-NEXT:    sbc.w r1, r2, r1
-; CHECK-NEXT:    add.w r0, r0, r1, asr #31
-; CHECK-NEXT:    eor.w r0, r0, r1, asr #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:    sbc.w r0, r1, r0
+; CHECK-NEXT:    add.w r1, r2, r0, asr #31
+; CHECK-NEXT:    eor.w r0, r1, r0, asr #31
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
   %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
   %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
@@ -599,8 +618,10 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
 ; CHECK-LABEL: vabd_loop_s32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    mov.w lr, #256
@@ -632,31 +653,34 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    subs.w r9, r5, r7
 ; CHECK-NEXT:    asr.w r6, r5, #31
+; CHECK-NEXT:    vmov r5, s6
 ; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
+; CHECK-NEXT:    vmov.32 q1[0], r8
+; CHECK-NEXT:    vmov.32 q1[1], r9
 ; CHECK-NEXT:    and.w r6, r12, r6, asr #31
 ; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    bfi r4, r6, #4, #4
 ; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    subs.w r10, r6, r3
-; CHECK-NEXT:    asr.w r7, r6, #31
+; CHECK-NEXT:    asrs r7, r6, #31
+; CHECK-NEXT:    subs r6, r6, r3
 ; CHECK-NEXT:    sbc.w r3, r7, r3, asr #31
-; CHECK-NEXT:    vmov r7, s14
-; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    vmov.32 q1[2], r6
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    asrs r5, r7, #31
-; CHECK-NEXT:    subs r7, r7, r6
-; CHECK-NEXT:    sbc.w r5, r5, r6, asr #31
-; CHECK-NEXT:    asrs r6, r5, #31
-; CHECK-NEXT:    vmov q1[2], q1[0], r6, r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov q1[2], q1[0], r10, r8
-; CHECK-NEXT:    vmov q1[3], q1[1], r7, r9
-; CHECK-NEXT:    and r3, r3, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r4, r3, #8, #4
-; CHECK-NEXT:    and.w r3, r12, r5, asr #31
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r4, r3, #12, #4
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    asrs r7, r3, #31
+; CHECK-NEXT:    subs r3, r3, r5
+; CHECK-NEXT:    sbc.w r5, r7, r5, asr #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    asrs r7, r5, #31
+; CHECK-NEXT:    and.w r5, r12, r5, asr #31
+; CHECK-NEXT:    vmov.32 q2[2], r7
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov r7, s8
+; CHECK-NEXT:    and r7, r7, #1
+; CHECK-NEXT:    rsbs r7, r7, #0
+; CHECK-NEXT:    bfi r4, r7, #8, #4
+; CHECK-NEXT:    bfi r4, r5, #12, #4
 ; CHECK-NEXT:    vmsr p0, r4
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vsubt.i32 q1, q0, q1
@@ -664,7 +688,8 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    le lr, .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   br label %vector.body
 
@@ -809,10 +834,8 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
 ; CHECK-LABEL: vabd_loop_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    mov.w lr, #256
@@ -859,25 +882,28 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    bfi r4, r3, #4, #4
 ; CHECK-NEXT:    vmov r3, s9
 ; CHECK-NEXT:    subs.w r10, r5, r7
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    vmov r7, s15
+; CHECK-NEXT:    vmov r7, s10
+; CHECK-NEXT:    vmov r5, s14
 ; CHECK-NEXT:    sbc.w r3, r6, r3
-; CHECK-NEXT:    vmov r6, s11
-; CHECK-NEXT:    asr.w r11, r3, #31
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    subs r3, r3, r5
-; CHECK-NEXT:    sbc.w r5, r7, r6
-; CHECK-NEXT:    asrs r6, r5, #31
-; CHECK-NEXT:    and.w r5, r12, r5, asr #31
-; CHECK-NEXT:    vmov q2[2], q2[0], r6, r11
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov r6, s8
-; CHECK-NEXT:    vmov q2[2], q2[0], r10, r8
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r9
+; CHECK-NEXT:    vmov r6, s15
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    vmov.32 q2[0], r8
+; CHECK-NEXT:    vmov.32 q2[1], r9
+; CHECK-NEXT:    vmov.32 q2[2], r10
+; CHECK-NEXT:    subs r5, r5, r7
+; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    sbc.w r3, r6, r3
+; CHECK-NEXT:    asrs r6, r3, #31
+; CHECK-NEXT:    and.w r3, r12, r3, asr #31
+; CHECK-NEXT:    vmov.32 q4[2], r6
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov r6, s16
 ; CHECK-NEXT:    and r6, r6, #1
 ; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    bfi r4, r6, #8, #4
-; CHECK-NEXT:    bfi r4, r5, #12, #4
+; CHECK-NEXT:    bfi r4, r3, #12, #4
 ; CHECK-NEXT:    vmsr p0, r4
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vsubt.i32 q2, q1, q2
@@ -885,8 +911,7 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    le lr, .LBB11_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   br label %vector.body
 

diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
index 56f9f56ed09e..a40beb4b0eba 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
@@ -367,31 +367,36 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, <2 x i64> %srcb, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    eors r2, r3
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q1, q3, q0
-; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vbic q0, q3, q4
+; CHECK-NEXT:    vand q1, q2, q4
+; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, %srcb
@@ -402,31 +407,36 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, <2 x i64> %srcb, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: vcmp_eq_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    eors r2, r3
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q1, q3, q0
-; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vbic q0, q3, q4
+; CHECK-NEXT:    vand q1, q2, q4
+; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, %srcb
@@ -437,76 +447,84 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
 ; CHECK-LABEL: vcmp_multi_v2i32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q0, q2, q0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    subs r1, r0, r2
-; CHECK-NEXT:    asr.w r12, r0, #31
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vmov lr, s0
+; CHECK-NEXT:    subs.w r1, lr, r2
+; CHECK-NEXT:    asr.w r12, lr, #31
 ; CHECK-NEXT:    sbcs.w r1, r12, r2, asr #31
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    csetm lr, ne
-; CHECK-NEXT:    asr.w r12, r2, #31
-; CHECK-NEXT:    subs r4, r2, r1
-; CHECK-NEXT:    sbcs.w r1, r12, r1, asr #31
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    subs r0, r1, r2
+; CHECK-NEXT:    asr.w r12, r1, #31
+; CHECK-NEXT:    sbcs.w r0, r12, r2, asr #31
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, lr
 ; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, lr
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q1, q1, q4
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vand q1, q5, q4
 ; CHECK-NEXT:    vand q1, q3, q1
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
   %a4 = icmp eq <2 x i64> %a, zeroinitializer
   %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c
   %a6 = icmp ne <2 x i32> %b, zeroinitializer

diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
index 818b65db9e87..06361d952120 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll
@@ -438,22 +438,24 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    eors r0, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -471,22 +473,24 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    eors r0, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -499,76 +503,84 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
 ; CHECK-LABEL: vcmp_multi_v2i32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q0, q2, q0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    subs r1, r0, r2
-; CHECK-NEXT:    asr.w r12, r0, #31
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vmov lr, s0
+; CHECK-NEXT:    subs.w r1, lr, r2
+; CHECK-NEXT:    asr.w r12, lr, #31
 ; CHECK-NEXT:    sbcs.w r1, r12, r2, asr #31
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    csetm lr, ne
-; CHECK-NEXT:    asr.w r12, r2, #31
-; CHECK-NEXT:    subs r4, r2, r1
-; CHECK-NEXT:    sbcs.w r1, r12, r1, asr #31
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    subs r0, r1, r2
+; CHECK-NEXT:    asr.w r12, r1, #31
+; CHECK-NEXT:    sbcs.w r0, r12, r2, asr #31
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, lr
 ; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, lr
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q1, q1, q4
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vand q1, q5, q4
 ; CHECK-NEXT:    vand q1, q3, q1
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
   %a4 = icmp eq <2 x i64> %a, zeroinitializer
   %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c
   %a6 = icmp ne <2 x i32> %b, zeroinitializer
@@ -1019,22 +1031,24 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eq_v2i64(<2 x i64> %src, i64 %src2, <2
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    eors r0, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -1052,22 +1066,24 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eq_v2i32(<2 x i64> %src, i64 %src2, <2
 ; CHECK-NEXT:    eors r2, r1
 ; CHECK-NEXT:    eors r3, r0
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    eors r1, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    eors r0, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    eors r0, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %i = insertelement <2 x i64> undef, i64 %src2, i32 0
@@ -1080,76 +1096,84 @@ entry:
 define arm_aapcs_vfpcc <2 x i32> @vcmp_r_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
 ; CHECK-LABEL: vcmp_r_multi_v2i32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q0, q2, q0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    subs r1, r0, r2
-; CHECK-NEXT:    asr.w r12, r0, #31
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vmov lr, s0
+; CHECK-NEXT:    subs.w r1, lr, r2
+; CHECK-NEXT:    asr.w r12, lr, #31
 ; CHECK-NEXT:    sbcs.w r1, r12, r2, asr #31
 ; CHECK-NEXT:    mov.w r1, #0
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    csetm lr, ne
-; CHECK-NEXT:    asr.w r12, r2, #31
-; CHECK-NEXT:    subs r4, r2, r1
-; CHECK-NEXT:    sbcs.w r1, r12, r1, asr #31
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    subs r0, r1, r2
+; CHECK-NEXT:    asr.w r12, r1, #31
+; CHECK-NEXT:    sbcs.w r0, r12, r2, asr #31
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, lr
 ; CHECK-NEXT:    tst.w r0, #1
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, lr
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q1, q1, q4
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vand q1, q5, q4
 ; CHECK-NEXT:    vand q1, q3, q1
 ; CHECK-NEXT:    vbic q0, q0, q1
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
   %a4 = icmp eq <2 x i64> %a, zeroinitializer
   %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c
   %a6 = icmp ne <2 x i32> %b, zeroinitializer

diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
index 759f4ddcac4d..e9b717494234 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll
@@ -363,21 +363,23 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, zeroinitializer
@@ -390,21 +392,23 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, zeroinitializer
@@ -777,21 +781,23 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eqz_v2i64(<2 x i64> %src, <2 x i64> %a,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> zeroinitializer, %src
@@ -804,21 +810,23 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eqz_v2i32(<2 x i64> %src, <2 x i32> %a,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vbic q2, q2, q0
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    vorr q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vbic q0, q2, q3
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %src, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll
index 953dafe1228b..e408bc46b47a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll
@@ -4,8 +4,10 @@
 define arm_aapcs_vfpcc <4 x i32> @vcreate_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: vcreate_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    bx lr
 entry:
   %conv = zext i32 %a to i64
@@ -25,8 +27,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_0123(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_0123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q0[3], r3
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -39,8 +43,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_3210(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_3210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -53,8 +59,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_0213(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_0213:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.32 q0[3], r3
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -67,7 +75,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_0220(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_0220:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.32 q0[2], r2
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -80,8 +89,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_321(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_321:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.32 q0[1], r2
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r2
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -94,7 +104,8 @@ define arm_aapcs_vfpcc <4 x i32> @insert_310(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_310:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -106,7 +117,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_320(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_320:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[2], r1
 ; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -119,7 +131,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_31(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_31:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 3
@@ -152,8 +165,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_210(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_210:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 2
@@ -165,7 +179,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @insert_20(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: insert_20:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %v1 = insertelement <4 x i32> undef, i32 %a, i32 2
@@ -230,26 +245,28 @@ entry:
 define hidden <8 x i16> @create_i16(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d, i16 zeroext %a2, i16 zeroext %b2, i16 zeroext %c2, i16 zeroext %d2) local_unnamed_addr #0 {
 ; CHECK-LABEL: create_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r5, r6, r7, lr}
-; CHECK-NEXT:    push {r5, r6, r7, lr}
-; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    lsll r2, r7, #16
 ; CHECK-NEXT:    orr.w r0, r1, r0, lsl #16
-; CHECK-NEXT:    orr.w r12, r2, r3
-; CHECK-NEXT:    ldr r2, [sp, #24]
-; CHECK-NEXT:    ldr r3, [sp, #28]
-; CHECK-NEXT:    orrs r0, r7
 ; CHECK-NEXT:    lsll r2, r5, #16
-; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r12
-; CHECK-NEXT:    ldrd r1, r2, [sp, #16]
-; CHECK-NEXT:    orr.w r1, r2, r1, lsl #16
-; CHECK-NEXT:    orrs r1, r5
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    ldrd lr, r4, [sp, #16]
+; CHECK-NEXT:    orr.w r1, r2, r3
+; CHECK-NEXT:    ldr.w r12, [sp, #24]
+; CHECK-NEXT:    orrs r0, r5
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    ldr r0, [sp, #28]
+; CHECK-NEXT:    lsll r12, r7, #16
+; CHECK-NEXT:    orr.w r4, r4, lr, lsl #16
+; CHECK-NEXT:    orr.w r0, r0, r12
+; CHECK-NEXT:    orrs r7, r4
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r7
 ; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    pop {r5, r6, r7, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %conv = zext i16 %a to i64
   %shl = shl nuw i64 %conv, 48
@@ -308,59 +325,59 @@ entry:
 define hidden <16 x i8> @create_i8(i8 zeroext %a1, i8 zeroext %b1, i8 zeroext %c1, i8 zeroext %d1, i8 zeroext %a2, i8 zeroext %b2, i8 zeroext %c2, i8 zeroext %d2, i8 zeroext %a3, i8 zeroext %b3, i8 zeroext %c3, i8 zeroext %d3, i8 zeroext %a4, i8 zeroext %b4, i8 zeroext %c4, i8 zeroext %d4) local_unnamed_addr #0 {
 ; CHECK-LABEL: create_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT:    ldr r4, [sp, #36]
+; CHECK-NEXT:    .save {r4, r5, r7, r9, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r7, r9, r11, lr}
+; CHECK-NEXT:    ldr.w r12, [sp, #28]
 ; CHECK-NEXT:    mov.w r11, #0
-; CHECK-NEXT:    ldr r6, [sp, #32]
+; CHECK-NEXT:    ldr r4, [sp, #24]
+; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    lsll r12, r11, #16
+; CHECK-NEXT:    lsls r1, r1, #16
+; CHECK-NEXT:    lsll r4, r5, #24
+; CHECK-NEXT:    orr.w r0, r1, r0, lsl #22
+; CHECK-NEXT:    orr.w r12, r12, r4
+; CHECK-NEXT:    ldr r4, [sp, #32]
 ; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    lsll r4, r11, #16
-; CHECK-NEXT:    mov lr, r1
-; CHECK-NEXT:    lsll r6, r7, #24
-; CHECK-NEXT:    mov r12, r3
-; CHECK-NEXT:    orr.w r1, r6, r4
-; CHECK-NEXT:    ldr r4, [sp, #40]
+; CHECK-NEXT:    orr.w r0, r0, r2, lsl #8
+; CHECK-NEXT:    lsll r4, r7, #8
+; CHECK-NEXT:    add r0, r3
+; CHECK-NEXT:    orr.w r12, r12, r4
+; CHECK-NEXT:    ldr r4, [sp, #36]
+; CHECK-NEXT:    orrs r0, r5
+; CHECK-NEXT:    ldr r2, [sp, #56]
+; CHECK-NEXT:    orr.w r0, r0, r11
+; CHECK-NEXT:    orr.w r4, r4, r12
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    orrs r0, r7
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    ldr r0, [sp, #60]
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    ldr r6, [sp, #68]
-; CHECK-NEXT:    lsll r4, r3, #8
-; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    orrs r1, r4
-; CHECK-NEXT:    ldr r4, [sp, #44]
-; CHECK-NEXT:    lsll r6, r5, #16
+; CHECK-NEXT:    lsll r0, r1, #16
+; CHECK-NEXT:    lsll r2, r3, #24
+; CHECK-NEXT:    orrs r0, r2
+; CHECK-NEXT:    ldr r2, [sp, #64]
 ; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    orr.w r8, r1, r4
-; CHECK-NEXT:    ldr r4, [sp, #64]
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    lsll r4, r1, #24
-; CHECK-NEXT:    orrs r4, r6
-; CHECK-NEXT:    ldr r6, [sp, #72]
-; CHECK-NEXT:    lsll r6, r9, #8
-; CHECK-NEXT:    orrs r4, r6
-; CHECK-NEXT:    ldr r6, [sp, #76]
-; CHECK-NEXT:    orrs r4, r6
-; CHECK-NEXT:    lsl.w r6, lr, #16
-; CHECK-NEXT:    orr.w r0, r6, r0, lsl #22
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r8
+; CHECK-NEXT:    lsll r2, r9, #8
+; CHECK-NEXT:    orrs r0, r2
+; CHECK-NEXT:    ldr r2, [sp, #68]
+; CHECK-NEXT:    orrs r0, r2
+; CHECK-NEXT:    ldr r2, [sp, #40]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    ldr r0, [sp, #44]
+; CHECK-NEXT:    lsls r0, r0, #16
+; CHECK-NEXT:    orr.w r0, r0, r2, lsl #22
+; CHECK-NEXT:    ldr r2, [sp, #48]
 ; CHECK-NEXT:    orr.w r0, r0, r2, lsl #8
 ; CHECK-NEXT:    ldr r2, [sp, #52]
-; CHECK-NEXT:    add r0, r12
-; CHECK-NEXT:    orrs r0, r7
-; CHECK-NEXT:    orr.w r0, r0, r11
-; CHECK-NEXT:    lsls r2, r2, #16
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    orrs r0, r3
-; CHECK-NEXT:    ldr r3, [sp, #48]
-; CHECK-NEXT:    orr.w r2, r2, r3, lsl #22
-; CHECK-NEXT:    ldr r3, [sp, #56]
-; CHECK-NEXT:    orr.w r2, r2, r3, lsl #8
-; CHECK-NEXT:    ldr r3, [sp, #60]
-; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    orrs r1, r5
-; CHECK-NEXT:    orr.w r1, r1, r9
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    orr.w r0, r0, r9
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r7, r9, r11, pc}
 entry:
   %conv = zext i8 %a1 to i64
   %shl = shl nuw nsw i64 %conv, 54

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index 5fa04f8a3477..831ca0499333 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -44,15 +44,17 @@ define arm_aapcs_vfpcc <4 x i32> @foo_int32_float(<4 x float> %src) {
 ; CHECK-MVE-LABEL: foo_int32_float:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s0
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s2
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s8, s1
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s3
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s1
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s2
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s8, s3
 ; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-MVE-NEXT:    vmov.32 q0[0], r0
+; CHECK-MVE-NEXT:    vmov r0, s6
+; CHECK-MVE-NEXT:    vmov.32 q0[1], r0
+; CHECK-MVE-NEXT:    vmov r0, s10
+; CHECK-MVE-NEXT:    vmov.32 q0[2], r0
 ; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov r1, s10
-; CHECK-MVE-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-MVE-NEXT:    vmov.32 q0[3], r0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_int32_float:
@@ -68,15 +70,17 @@ define arm_aapcs_vfpcc <4 x i32> @foo_uint32_float(<4 x float> %src) {
 ; CHECK-MVE-LABEL: foo_uint32_float:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcvt.u32.f32 s4, s0
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s6, s2
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s8, s1
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s10, s3
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s6, s1
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s10, s2
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s8, s3
 ; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-MVE-NEXT:    vmov.32 q0[0], r0
+; CHECK-MVE-NEXT:    vmov r0, s6
+; CHECK-MVE-NEXT:    vmov.32 q0[1], r0
+; CHECK-MVE-NEXT:    vmov r0, s10
+; CHECK-MVE-NEXT:    vmov.32 q0[2], r0
 ; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov r1, s10
-; CHECK-MVE-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-MVE-NEXT:    vmov.32 q0[3], r0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_uint32_float:
@@ -345,21 +349,24 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @foo_int64_float(<2 x double> %src) {
 ; CHECK-LABEL: foo_int64_float:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_d2lz
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r0, r1, d9
+; CHECK-NEXT:    vmov r2, r3, d9
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_d2lz
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = fptosi <2 x double> %src to <2 x i64>
   ret <2 x i64> %out
@@ -368,21 +375,24 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @foo_uint64_float(<2 x double> %src) {
 ; CHECK-LABEL: foo_uint64_float:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_d2ulz
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r0, r1, d9
+; CHECK-NEXT:    vmov r2, r3, d9
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_d2ulz
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r5
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = fptoui <2 x double> %src to <2 x i64>
   ret <2 x i64> %out

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
index 75147225afb0..bce76f037a78 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -38,8 +38,10 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vdup_i64(i64 %src) {
 ; CHECK-LABEL: vdup_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r1
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <2 x i64> undef, i64 %src, i32 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
index d64a4c9c87d6..df2cb4361f2f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -130,36 +130,40 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
 ; CHECK-LABEL: add_v8i16_v8i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q2[2], r0
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.32 q3[2], r1
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.32 q3[2], r1
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s15
 ; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[7]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -179,56 +183,62 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
 ; CHECK-LABEL: add_v8i16_v8i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    sxth r1, r1
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r2, r1, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    asrs r1, r0, #31
+; CHECK-NEXT:    vmov.32 q1[3], r1
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[3]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    adc.w r12, r1, r0, asr #31
 ; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    sxth r3, r3
 ; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    asrs r3, r1, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r12, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    adc.w r1, r2, r1, asr #31
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    adc.w r12, r0, r3, asr #31
-; CHECK-NEXT:    vmov.u16 r3, q0[5]
-; CHECK-NEXT:    sxth r3, r3
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    adc.w r0, r0, r3, asr #31
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r0, r2, asr #31
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    sxth r3, r0
-; CHECK-NEXT:    adds r0, r1, r3
-; CHECK-NEXT:    adc.w r1, r2, r3, asr #31
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -385,35 +395,39 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q2[2], r0
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    vmov.32 q3[2], r1
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov.32 q3[2], r1
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vmov r1, s12
 ; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    vmov r1, s15
 ; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -422,10 +436,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[9]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -434,10 +449,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[11]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -446,10 +462,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[13]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -458,11 +475,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[15]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -482,116 +500,130 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
 ; CHECK-LABEL: add_v16i8_v16i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r2, r1, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    asrs r1, r0, #31
+; CHECK-NEXT:    vmov.32 q1[3], r1
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[3]
-; CHECK-NEXT:    adc.w r12, r0, r1, asr #31
+; CHECK-NEXT:    adc.w r12, r1, r0, asr #31
 ; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r1
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    asrs r3, r1, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r12, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    adc.w r1, r2, r1, asr #31
 ; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    adc.w r12, r0, r3, asr #31
-; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    adc.w r12, r0, r3, asr #31
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    adc.w r12, r0, r3, asr #31
-; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    adc.w r12, r0, r3, asr #31
-; CHECK-NEXT:    vmov.u8 r3, q0[11]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds.w r12, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    adc.w r12, r0, r3, asr #31
-; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r0, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    adc.w r0, r0, r3, asr #31
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r0, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    sxtb r3, r0
-; CHECK-NEXT:    adds r0, r1, r3
-; CHECK-NEXT:    adc.w r1, r2, r3, asr #31
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -797,36 +829,40 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmov.u16 r3, q0[1]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    add.w r12, r3, r2
+; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[2]
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[3]
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r2, r12
-; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[4]
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[5]
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    add r12, r2
+; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds.w r4, r12, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[7]
 ; CHECK-NEXT:    adc.w r12, r2, lr
 ; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r3, r3, r4
@@ -851,58 +887,64 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmov.u16 r3, q0[1]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
-; CHECK-NEXT:    asrs r2, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
 ; CHECK-NEXT:    vmov lr, s6
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
 ; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    adc.w r12, r12, r3, asr #31
-; CHECK-NEXT:    vmov.u16 r3, q0[3]
-; CHECK-NEXT:    sxth r3, r3
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds.w lr, r4, r2
-; CHECK-NEXT:    vmov.u16 r4, q0[4]
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    adc.w r12, r12, r3, asr #31
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov.32 q1[2], r2
 ; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
+; CHECK-NEXT:    vmov.32 q1[3], r3
 ; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    adds.w r4, r4, lr
 ; CHECK-NEXT:    adc.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    vmov.u16 r4, q0[4]
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
+; CHECK-NEXT:    vmov.32 q1[0], r4
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov.32 q1[1], r4
+; CHECK-NEXT:    vmov.u16 r4, q0[5]
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    vmov.32 q1[2], r4
+; CHECK-NEXT:    asrs r2, r4, #31
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r4, asr #31
 ; CHECK-NEXT:    vmov.u16 r4, q0[6]
-; CHECK-NEXT:    adc.w r2, r12, r2, asr #31
 ; CHECK-NEXT:    sxth r4, r4
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    adc.w r2, r2, r4, asr #31
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
 ; CHECK-NEXT:    vmov.u16 r4, q0[7]
 ; CHECK-NEXT:    sxth r4, r4
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    adc.w r2, r2, r4, asr #31
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -1077,35 +1119,39 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    add.w r12, r3, r2
+; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    add r2, r12
-; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[4]
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[5]
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov lr, s15
-; CHECK-NEXT:    add r12, r2
+; CHECK-NEXT:    add.w r12, r2, r3
+; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds.w r4, r12, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
 ; CHECK-NEXT:    adc.w r12, r2, lr
 ; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    vand q2, q2, q1
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -1114,55 +1160,59 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    adds.w lr, r3, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
 ; CHECK-NEXT:    adc.w r3, r12, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[9]
+; CHECK-NEXT:    vmov.32 q2[2], r4
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    adds.w r12, lr, r2
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w lr, r4, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    vmov.32 q2[2], r4
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    adds.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w lr, r4, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    vmov.32 q2[2], r4
 ; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    adds.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w lr, r4, r2
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, s11
+; CHECK-NEXT:    adds.w r12, r12, r2
+; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
-; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[15]
+; CHECK-NEXT:    vmov.32 q2[2], r4
+; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    adds.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r4, pc}
@@ -1179,118 +1229,132 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    vmov.u8 r3, q0[1]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
-; CHECK-NEXT:    asrs r2, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    vmov.32 q1[3], r3
 ; CHECK-NEXT:    vmov lr, s6
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    adds.w lr, lr, r2
+; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
 ; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    adc.w r12, r12, r3, asr #31
-; CHECK-NEXT:    vmov.u8 r3, q0[3]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r3, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r2
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    adds.w lr, r4, r2
-; CHECK-NEXT:    vmov.u8 r4, q0[4]
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    adc.w r12, r12, r3, asr #31
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov.32 q1[2], r2
 ; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
+; CHECK-NEXT:    vmov.32 q1[3], r3
 ; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    adds.w r4, r4, lr
 ; CHECK-NEXT:    adc.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[6]
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q1[0], r4
+; CHECK-NEXT:    asrs r4, r4, #31
+; CHECK-NEXT:    vmov.32 q1[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[5]
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    vmov.32 q1[2], r4
+; CHECK-NEXT:    asrs r2, r4, #31
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
+; CHECK-NEXT:    vmov.u8 r4, q0[6]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.32 q1[0], r4
 ; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    vmov.32 q1[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[7]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.32 q1[2], r4
+; CHECK-NEXT:    asrs r2, r4, #31
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
 ; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    vmov.32 q1[0], r4
 ; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    vmov.32 q1[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[9]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.32 q1[2], r4
+; CHECK-NEXT:    asrs r2, r4, #31
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
 ; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    vmov.32 q1[0], r4
 ; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    vmov.32 q1[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.32 q1[2], r4
+; CHECK-NEXT:    asrs r2, r4, #31
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    adc.w r12, r12, r4, asr #31
 ; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    adc.w r12, r12, r2, asr #31
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
+; CHECK-NEXT:    vmov.32 q1[0], r4
 ; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    asrs r3, r2, #31
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov.32 q1[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.32 q1[2], r4
+; CHECK-NEXT:    asrs r2, r4, #31
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r4, asr #31
 ; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    adc.w r2, r12, r2, asr #31
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    adc.w r2, r2, r4, asr #31
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
 ; CHECK-NEXT:    vmov.u8 r4, q0[15]
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    adc.w r2, r2, r4, asr #31
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adc.w r3, r3, r4, asr #31
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index 628e0d6ec64c..e59fb0bb1ef4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -47,19 +47,21 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %b) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -79,24 +81,28 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %b) {
 ; CHECK-LABEL: add_v2i32_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vand q0, q2, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -196,23 +202,28 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.u16 r1, q2[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.u16 r1, q2[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vmrs r0, p0
 ; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    ubfx r2, r0, #4, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov.32 q3[3], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.32 q4[2], r1
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r1, s15
@@ -222,64 +233,75 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    ubfx r3, r0, #8, #1
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r0, r0, #12, #1
+; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r0, r3
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r3, q0[3]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q4[2], r0
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r0, s13
 ; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[6]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q2[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q2[5]
-; CHECK-NEXT:    vmov.u16 r3, q2[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r1
+; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    adds.w r12, r2, r0
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    and r3, r1, #1
-; CHECK-NEXT:    ubfx r0, r1, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adc.w r12, r12, r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r12, r3
-; CHECK-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.u16 r3, q0[7]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vand q0, q3, q1
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    adcs r1, r2
@@ -304,48 +326,58 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q1, q3, q2
 ; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r0, p0
 ; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    ubfx r2, r0, #4, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.32 q2[3], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
 ; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT:    vmov.32 q3[0], r1
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov.32 q3[3], r1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    vmov r12, s11
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r3, r12
 ; CHECK-NEXT:    ubfx r3, r0, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r0, r0, #0
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r3, q0[3]
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r0, s9
@@ -356,51 +388,61 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-NEXT:    adds.w r12, r1, r0
 ; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.u16 r3, q1[6]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.u16 r3, q1[7]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r3
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov.32 q2[1], r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[5]
-; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r0
-; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q2[2], r3
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.u16 r3, q0[7]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -425,18 +467,20 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %b) {
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -456,28 +500,32 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %b) {
 ; CHECK-LABEL: add_v2i16_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q2, #0xffff
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vand q2, q1, q2
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    sxth r1, r1
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -662,23 +710,28 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
 ; CHECK-NEXT:    vmov.u16 r0, q5[0]
-; CHECK-NEXT:    vmov.u16 r1, q5[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[1]
-; CHECK-NEXT:    vmov.u16 r1, q5[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vmrs r0, p0
 ; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    ubfx r2, r0, #4, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
+; CHECK-NEXT:    vmov.32 q6[0], r1
+; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q6[2], r1
+; CHECK-NEXT:    vmov.32 q6[3], r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
-; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
+; CHECK-NEXT:    vmov.32 q7[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.32 q7[2], r1
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r1, s27
@@ -688,64 +741,75 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov r2, s26
 ; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    ubfx r3, r0, #8, #1
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r0, r0, #12, #1
+; CHECK-NEXT:    vmov.32 q6[0], r3
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.32 q6[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.u8 r3, q0[3]
-; CHECK-NEXT:    vmov q7[2], q7[0], r3, r0
+; CHECK-NEXT:    vmov.32 q7[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q7[2], r0
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    vmov r0, s25
 ; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    adcs r0, r1
-; CHECK-NEXT:    vmov r1, s27
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov.u16 r3, q5[6]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q5[4]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r1
-; CHECK-NEXT:    vmov.u16 r1, q5[5]
-; CHECK-NEXT:    vmov.u16 r3, q5[7]
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r1
+; CHECK-NEXT:    vmov r3, s27
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    adds.w r12, r2, r0
+; CHECK-NEXT:    vmov.u16 r2, q5[4]
+; CHECK-NEXT:    vmov.32 q6[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[5]
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[6]
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[7]
+; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vmrs r1, p0
-; CHECK-NEXT:    and r3, r1, #1
-; CHECK-NEXT:    ubfx r0, r1, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q5[2], r3
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    vmov.32 q6[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r0
+; CHECK-NEXT:    vmov.32 q6[2], r3
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    adds r2, r2, r3
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adc.w r12, r12, r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov r3, s21
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r12, r3
-; CHECK-NEXT:    ubfx r3, r1, #8, #1
-; CHECK-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s23
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r3
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov.32 q6[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q6[2], r2
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r1, s21
+; CHECK-NEXT:    vmov r2, s21
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    vmov r3, s22
 ; CHECK-NEXT:    adcs r1, r2
@@ -771,41 +835,49 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.u16 r3, q2[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.u16 r3, q2[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    vmov.32 q4[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r1, s15
-; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.u8 r3, q0[11]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
+; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.32 q4[2], r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r3, s12
@@ -815,44 +887,52 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s15
 ; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[6]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.u16 r3, q2[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
+; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.u8 r3, q0[15]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vand q0, q3, q1
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -900,48 +980,58 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q4, q2, q1
 ; CHECK-NEXT:    vmov.u16 r0, q4[0]
-; CHECK-NEXT:    vmov.u16 r1, q4[2]
-; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
+; CHECK-NEXT:    vmov.32 q5[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q4[1]
-; CHECK-NEXT:    vmov.u16 r1, q4[3]
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.32 q5[3], r0
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
 ; CHECK-NEXT:    vmrs r0, p0
 ; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    ubfx r2, r0, #4, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r1
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r1
+; CHECK-NEXT:    vmov.32 q5[0], r1
+; CHECK-NEXT:    vmov.32 q5[1], r1
+; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q5[2], r1
+; CHECK-NEXT:    vmov.32 q5[3], r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
+; CHECK-NEXT:    vmov.32 q6[0], r1
 ; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
+; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov.32 q6[2], r1
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    vmov.32 q6[3], r1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    vmov r1, s20
+; CHECK-NEXT:    vmov r1, s22
+; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov r12, s23
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds r1, r1, r3
+; CHECK-NEXT:    vmov r3, s21
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r3, r12
 ; CHECK-NEXT:    ubfx r3, r0, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r0, r0, #0
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r0, r3
+; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q5[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.u8 r3, q0[3]
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r0
+; CHECK-NEXT:    vmov.32 q6[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r0
+; CHECK-NEXT:    vmov.32 q6[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q6[3], r0
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r0, s21
@@ -952,50 +1042,60 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    adds.w r12, r1, r0
 ; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q4[4]
-; CHECK-NEXT:    vmov.u16 r3, q4[6]
-; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[5]
-; CHECK-NEXT:    vmov.u16 r3, q4[7]
-; CHECK-NEXT:    vmov q5[3], q5[1], r3, r2
+; CHECK-NEXT:    vmov.32 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[6]
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[7]
+; CHECK-NEXT:    vmov.32 q5[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
-; CHECK-NEXT:    vmov q4[3], q4[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.32 q4[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov.32 q5[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q5[2], q5[0], r3, r0
-; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q5[2], r3
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q5[3], q5[1], r3, r0
+; CHECK-NEXT:    vmov.32 q5[3], r3
 ; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r3, s17
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r1, s19
-; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q4[1], r3
+; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.32 q4[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q5[3], q5[1], r3, r2
+; CHECK-NEXT:    vmov.32 q5[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q5[3], r2
 ; CHECK-NEXT:    vand q4, q5, q4
 ; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    vmov r2, s17
@@ -1024,50 +1124,60 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q1, q2, q1
 ; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.u16 r3, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.u16 r3, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
-; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r0
+; CHECK-NEXT:    vmov.32 q3[3], r3
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.u8 r3, q0[11]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
@@ -1076,53 +1186,63 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    adds.w r12, r0, r3
-; CHECK-NEXT:    vmov.u16 r3, q1[6]
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.u16 r3, q1[7]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vmov.32 q1[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[12]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov.32 q2[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r0
-; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q2[2], r3
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    vand q1, q2, q1
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    adds.w r3, r3, r12
-; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    adds.w r12, r12, r0
 ; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r1, r1, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r3
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.u8 r3, q0[15]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
@@ -1148,18 +1268,20 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %b) {
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -1179,28 +1301,32 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %b) {
 ; CHECK-LABEL: add_v2i8_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q2, #0xff
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vand q2, q1, q2
+; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -1221,19 +1347,21 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -1299,19 +1427,21 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b,
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -1336,24 +1466,28 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %b,
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vand q0, q2, q0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -1463,23 +1597,28 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.u16 r3, q2[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.u16 r3, q2[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.32 q1[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xffff
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsb.w r12, r3, #0
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r12
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r12
-; CHECK-NEXT:    vmov.u16 r12, q0[0]
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    vmov.32 q4[0], r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[1]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r12
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r12, s15
@@ -1489,61 +1628,72 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    add lr, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.u16 r3, q0[3]
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
+; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.32 q4[2], r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adds.w r4, lr, r3
 ; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    adc.w lr, r12, r2
 ; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[6]
-; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    adds.w r12, r4, r3
+; CHECK-NEXT:    adc.w lr, lr, r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.u16 r3, q2[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov.32 q2[1], r4
 ; CHECK-NEXT:    ubfx r4, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    vmov.32 q2[2], r4
+; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    vmov.u16 r4, q0[4]
+; CHECK-NEXT:    vmov.32 q3[0], r4
 ; CHECK-NEXT:    vmov.u16 r4, q0[5]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
+; CHECK-NEXT:    vmov.32 q3[2], r4
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds.w lr, lr, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    adds.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, lr, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q2[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.u16 r3, q0[7]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vand q0, q3, q1
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
@@ -1576,113 +1726,133 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b,
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q1, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.u16 r3, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.u16 r3, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r3, r12, #1
-; CHECK-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov.32 q3[1], r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[1]
-; CHECK-NEXT:    sxth r2, r2
 ; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[3], r3
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov lr, s11
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    adds r5, r4, r2
-; CHECK-NEXT:    ubfx r4, r12, #8, #1
-; CHECK-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    vmov r12, s11
+; CHECK-NEXT:    vmov r5, s9
+; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q2[0], r3
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r3, r3, lr
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    adc.w r12, r12, r5
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.u16 r4, q0[3]
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    sxth r4, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adcs r3, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds.w r12, r5, r2
-; CHECK-NEXT:    vmov.u16 r5, q1[4]
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov.u16 r4, q1[6]
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r5
-; CHECK-NEXT:    vmov.u16 r5, q1[5]
-; CHECK-NEXT:    vmov.u16 r4, q1[7]
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov r5, s11
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w r12, r3, r4
+; CHECK-NEXT:    adc.w r3, r2, r5
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r5, p0
-; CHECK-NEXT:    and r4, r5, #1
-; CHECK-NEXT:    ubfx r2, r5, #4, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r4
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov.u16 r4, q0[5]
-; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    sxth r4, r4
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r2
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q1[0], r5
+; CHECK-NEXT:    vmov.32 q1[1], r5
+; CHECK-NEXT:    ubfx r5, r2, #4, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q1[2], r5
+; CHECK-NEXT:    vmov.32 q1[3], r5
+; CHECK-NEXT:    vmov.u16 r5, q0[4]
+; CHECK-NEXT:    sxth r5, r5
+; CHECK-NEXT:    vmov.32 q2[0], r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    vmov.u16 r5, q0[5]
+; CHECK-NEXT:    sxth r5, r5
+; CHECK-NEXT:    vmov.32 q2[2], r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    vmov.32 q2[3], r5
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r5, s5
+; CHECK-NEXT:    adds.w r12, r12, r4
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    adcs r5, r3
 ; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #8, #1
-; CHECK-NEXT:    ubfx r5, r5, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r4
-; CHECK-NEXT:    vmov.u16 r5, q0[6]
-; CHECK-NEXT:    vmov.u16 r4, q0[7]
-; CHECK-NEXT:    sxth r5, r5
-; CHECK-NEXT:    sxth r4, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
-; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r5, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -1700,18 +1870,20 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b,
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    orr.w r12, r3, r2
@@ -1736,28 +1908,32 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b,
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q2, #0xffff
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vand q2, q1, q2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r2
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -1929,8 +2105,8 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
@@ -1956,23 +2132,28 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q5[0]
-; CHECK-NEXT:    vmov.u16 r3, q5[2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[1]
-; CHECK-NEXT:    vmov.u16 r3, q5[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[2]
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[3]
+; CHECK-NEXT:    vmov.32 q1[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.i64 q1, #0xff
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsb.w r12, r3, #0
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q6[0], r3
+; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r12
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r12
-; CHECK-NEXT:    vmov.u8 r12, q0[0]
+; CHECK-NEXT:    vmov.32 q6[2], r3
+; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    vmov.32 q7[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    vmov q7[2], q7[0], r3, r12
+; CHECK-NEXT:    vmov.32 q7[2], r3
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r12, s27
@@ -1982,70 +2163,80 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    add lr, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q6[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.32 q6[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    vmov.u8 r3, q0[3]
-; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
+; CHECK-NEXT:    vmov.32 q7[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.32 q7[2], r2
 ; CHECK-NEXT:    vand q7, q7, q1
 ; CHECK-NEXT:    vand q6, q7, q6
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    adds.w lr, lr, r3
+; CHECK-NEXT:    adds.w r4, lr, r3
 ; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    adc.w lr, r12, r2
 ; CHECK-NEXT:    vmov r2, s27
-; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov.u16 r3, q5[6]
-; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    adds.w r12, r4, r3
+; CHECK-NEXT:    adc.w lr, lr, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov.32 q6[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[5]
-; CHECK-NEXT:    vmov.u16 r3, q5[7]
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[6]
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[7]
+; CHECK-NEXT:    vmov.32 q6[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
+; CHECK-NEXT:    and r4, r2, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov.32 q5[0], r4
+; CHECK-NEXT:    vmov.32 q5[1], r4
 ; CHECK-NEXT:    ubfx r4, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    vmov.32 q5[2], r4
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    vmov.32 q6[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[5]
-; CHECK-NEXT:    vmov q6[2], q6[0], r4, r3
+; CHECK-NEXT:    vmov.32 q6[2], r4
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    adds.w lr, lr, r4
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov r4, s21
+; CHECK-NEXT:    adds.w r12, r12, r3
 ; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds.w r4, r4, lr
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, lr, r3
 ; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q5[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov.32 q6[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q6[2], r2
 ; CHECK-NEXT:    vand q6, q6, q1
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds.w lr, r4, r3
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    adc.w r4, r12, r2
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    adds.w r12, lr, r3
-; CHECK-NEXT:    adc.w lr, r4, r2
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s23
+; CHECK-NEXT:    adc.w lr, r12, r2
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    adds.w r12, r3, r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -2062,104 +2253,121 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    adc.w lr, lr, r4
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.u16 r4, q2[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.u16 r4, q2[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r4, r2, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r4
-; CHECK-NEXT:    vmov.u8 r3, q0[8]
+; CHECK-NEXT:    vmov.32 q3[0], r4
+; CHECK-NEXT:    vmov.32 q3[1], r4
+; CHECK-NEXT:    ubfx r4, r2, #4, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov.32 q3[2], r4
+; CHECK-NEXT:    vmov.32 q3[3], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[8]
+; CHECK-NEXT:    vmov.32 q4[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov.32 q4[2], r4
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    adds.w r5, r12, r4
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r4, s13
+; CHECK-NEXT:    adds.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    adc.w lr, lr, r4
 ; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    adds.w r4, r4, r12
 ; CHECK-NEXT:    adc.w r12, lr, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    ubfx r4, r2, #8, #1
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r4
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r2
+; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.32 q4[2], r2
 ; CHECK-NEXT:    vand q4, q4, q1
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov.u16 r4, q2[6]
-; CHECK-NEXT:    adc.w r12, r2, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q2[5]
-; CHECK-NEXT:    vmov.u16 r4, q2[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s15
+; CHECK-NEXT:    adc.w lr, r12, r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    adds.w r12, r3, r2
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    adc.w lr, lr, r4
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    vmrs r3, p0
-; CHECK-NEXT:    and r4, r3, #1
-; CHECK-NEXT:    ubfx r2, r3, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r4, r2, #1
 ; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    vmov.32 q2[0], r4
+; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    ubfx r4, r2, #4, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov.32 q2[2], r4
+; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[12]
+; CHECK-NEXT:    vmov.32 q3[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
+; CHECK-NEXT:    vmov.32 q3[2], r4
 ; CHECK-NEXT:    vand q3, q3, q1
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    adds r2, r2, r5
-; CHECK-NEXT:    adc.w r5, r12, r4
-; CHECK-NEXT:    ubfx r4, r3, #8, #1
-; CHECK-NEXT:    ubfx r3, r3, #12, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    adds.w r12, r12, r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adc.w lr, lr, r4
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, lr, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov.u8 r3, q0[14]
-; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vand q0, q3, q1
 ; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r4
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -2199,239 +2407,279 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q4, q2, q1
 ; CHECK-NEXT:    vmov.u16 r2, q4[0]
-; CHECK-NEXT:    vmov.u16 r3, q4[2]
-; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q4[1]
-; CHECK-NEXT:    vmov.u16 r3, q4[3]
-; CHECK-NEXT:    vmov q5[3], q5[1], r3, r2
+; CHECK-NEXT:    vmov.32 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[2]
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[3]
+; CHECK-NEXT:    vmov.32 q5[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r3, r12, #1
-; CHECK-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q5[2], r3
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q6[0], r3
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
-; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q6[2], r3
 ; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
+; CHECK-NEXT:    vmov.32 q6[3], r3
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    vmov r3, s22
 ; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov lr, s23
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    adds r5, r4, r2
-; CHECK-NEXT:    ubfx r4, r12, #8, #1
-; CHECK-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    vmov r12, s23
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q5[0], r3
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r3, r3, lr
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r4
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r4
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    adc.w r12, r12, r5
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    vmov.u8 r4, q0[3]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q6[2], q6[0], r4, r2
+; CHECK-NEXT:    vmov.32 q6[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q6[3], q6[1], r4, r2
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q6[3], r2
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s23
-; CHECK-NEXT:    adcs r3, r2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adds.w r12, r5, r2
-; CHECK-NEXT:    vmov.u16 r5, q4[4]
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov.u16 r4, q4[6]
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r5
-; CHECK-NEXT:    vmov.u16 r5, q4[5]
-; CHECK-NEXT:    vmov.u16 r4, q4[7]
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r5
+; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov r5, s23
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w r12, r3, r4
+; CHECK-NEXT:    adc.w r3, r2, r5
+; CHECK-NEXT:    vmov.u16 r2, q4[4]
+; CHECK-NEXT:    vmov.32 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[5]
+; CHECK-NEXT:    vmov.32 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[6]
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[7]
+; CHECK-NEXT:    vmov.32 q5[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
-; CHECK-NEXT:    vmrs r5, p0
-; CHECK-NEXT:    and r4, r5, #1
-; CHECK-NEXT:    ubfx r2, r5, #4, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r4
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r2
-; CHECK-NEXT:    vand q4, q5, q4
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #8, #1
-; CHECK-NEXT:    ubfx r5, r5, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
-; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[6]
-; CHECK-NEXT:    vmov.u8 r4, q0[7]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q4[0], r5
+; CHECK-NEXT:    vmov.32 q4[1], r5
+; CHECK-NEXT:    ubfx r5, r2, #4, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q4[2], r5
+; CHECK-NEXT:    vmov.32 q4[3], r5
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    vmov.32 q5[1], r5
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r5
+; CHECK-NEXT:    vmov.32 q5[2], r5
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r5
+; CHECK-NEXT:    vmov.32 q5[3], r5
 ; CHECK-NEXT:    vand q4, q5, q4
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    vmov r5, s17
-; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adds.w r12, r12, r4
 ; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adcs r5, r3
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r5, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q4[1], r3
+; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.32 q4[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q5[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q5[3], r2
+; CHECK-NEXT:    vand q4, q5, q4
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    vmov r5, s19
-; CHECK-NEXT:    adds.w r12, r2, r4
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov.u8 r5, q3[8]
-; CHECK-NEXT:    vmov.16 q4[0], r5
-; CHECK-NEXT:    vmov.u8 r5, q3[9]
-; CHECK-NEXT:    vmov.16 q4[1], r5
-; CHECK-NEXT:    vmov.u8 r5, q3[10]
-; CHECK-NEXT:    vmov.16 q4[2], r5
-; CHECK-NEXT:    vmov.u8 r5, q3[11]
-; CHECK-NEXT:    vmov.16 q4[3], r5
-; CHECK-NEXT:    vmov.u8 r5, q3[12]
-; CHECK-NEXT:    vmov.16 q4[4], r5
-; CHECK-NEXT:    vmov.u8 r5, q3[13]
-; CHECK-NEXT:    vmov.16 q4[5], r5
-; CHECK-NEXT:    vmov.u8 r5, q3[14]
-; CHECK-NEXT:    vmov.16 q4[6], r5
-; CHECK-NEXT:    vmov.u8 r5, q3[15]
-; CHECK-NEXT:    vmov.16 q4[7], r5
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w r12, r3, r4
+; CHECK-NEXT:    adc.w r3, r2, r5
+; CHECK-NEXT:    vmov.u8 r2, q3[8]
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[9]
+; CHECK-NEXT:    vmov.16 q4[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[10]
+; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[11]
+; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[12]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[13]
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[14]
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[15]
+; CHECK-NEXT:    vmov.16 q4[7], r2
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov.u16 r5, q1[0]
-; CHECK-NEXT:    vmov.u16 r4, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r5
-; CHECK-NEXT:    vmov.u16 r5, q1[1]
-; CHECK-NEXT:    vmov.u16 r4, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r5, p0
-; CHECK-NEXT:    and r4, r5, #1
-; CHECK-NEXT:    ubfx r2, r5, #4, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
-; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r2
-; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #8, #1
-; CHECK-NEXT:    ubfx r5, r5, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r5, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r5, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[10]
-; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q2[0], r5
+; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    ubfx r5, r2, #4, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q2[2], r5
+; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    vmov.u8 r5, q0[8]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.32 q3[0], r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    vmov.32 q3[1], r5
+; CHECK-NEXT:    vmov.u8 r5, q0[9]
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
+; CHECK-NEXT:    vmov.32 q3[2], r5
 ; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    vmov.32 q3[3], r5
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adds.w r12, r12, r4
 ; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    adds.w r12, r2, r4
-; CHECK-NEXT:    vmov.u16 r4, q1[6]
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    vmov.u16 r5, q1[4]
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r5
-; CHECK-NEXT:    vmov.u16 r5, q1[5]
-; CHECK-NEXT:    vmov.u16 r4, q1[7]
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
-; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmrs r5, p0
-; CHECK-NEXT:    and r4, r5, #1
-; CHECK-NEXT:    ubfx r2, r5, #4, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    adcs r5, r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r5, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q2[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r4
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vand q2, q3, q2
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r5, s11
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w r12, r3, r4
+; CHECK-NEXT:    adc.w r3, r2, r5
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q1[0], r5
+; CHECK-NEXT:    vmov.32 q1[1], r5
+; CHECK-NEXT:    ubfx r5, r2, #4, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q1[2], r5
+; CHECK-NEXT:    vmov.32 q1[3], r5
+; CHECK-NEXT:    vmov.u8 r5, q0[12]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.32 q2[0], r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    vmov.u8 r5, q0[13]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.32 q2[2], r5
+; CHECK-NEXT:    asrs r5, r5, #31
+; CHECK-NEXT:    vmov.32 q2[3], r5
 ; CHECK-NEXT:    vand q1, q2, q1
 ; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    adds.w r4, r4, r12
-; CHECK-NEXT:    adc.w r12, r3, r2
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r5, s5
+; CHECK-NEXT:    adds.w r12, r12, r4
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    adcs r5, r3
 ; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    ubfx r4, r5, #8, #1
-; CHECK-NEXT:    ubfx r5, r5, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
-; CHECK-NEXT:    rsb.w r5, r5, #0
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[14]
-; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
-; CHECK-NEXT:    asrs r5, r5, #31
-; CHECK-NEXT:    asrs r4, r4, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
-; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r5, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q1
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r5
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r2, r5
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -2450,18 +2698,20 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i6
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vand q0, q0, q3
 ; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    orr.w r12, r3, r2
@@ -2486,28 +2736,32 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q2, #0xff
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vand q2, q1, q2
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q1[3], r2
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    asrs r2, r2, #31
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    asrs r2, r2, #31
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2533,19 +2787,21 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %b, i64
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r12, s7
 ; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    orrs.w r3, r3, r12
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index 5906fbc1abf9..ee15f82a71f5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -429,17 +429,19 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.32 q3[2], r0
 ; CHECK-NEXT:    vmov.i64 q2, #0xff
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT:    vmov.32 q4[2], r1
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r0, s14
@@ -447,163 +449,189 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    umull r12, r1, r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q4[2], r0
 ; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    orr.w lr, r3, r1
 ; CHECK-NEXT:    vmov.u8 r3, q1[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q1[3]
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    add r2, r12
 ; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r3
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    umull r3, r4, r4, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
-; CHECK-NEXT:    vmov.u8 r3, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r1
-; CHECK-NEXT:    add.w r1, r2, r12
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, lr
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    umull r0, r3, r0, r3
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov r1, s20
+; CHECK-NEXT:    vmov r0, s21
 ; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, lr, r0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    adds.w r12, r1, r0
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q1[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT:    adc.w r12, r0, r4
 ; CHECK-NEXT:    vmov.u8 r3, q0[4]
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[5]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[7]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT:    adc.w r12, r0, r4
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    vmov.u8 r4, q0[7]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[7]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[9]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[8]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT:    adc.w r12, r0, r4
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[8]
-; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[9]
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[9]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[11]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT:    adc.w r12, r0, r4
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[10]
-; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[11]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[13]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[0], r2
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[12]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT:    adc.w r12, r0, r4
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[12]
-; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[13]
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[13]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[15]
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r2, r4
+; CHECK-NEXT:    vmov.32 q5[0], r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q5[2], r2
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.32 q3[2], r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[14]
-; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    vand q1, q3, q2
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[15]
+; CHECK-NEXT:    vmov.32 q3[2], r3
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umlal r0, r1, r3, r2
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    umlal r0, r1, r3, r2
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -615,140 +643,152 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.u8 r0, q1[0]
 ; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov.u8 r3, q0[1]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w lr, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q1[2]
-; CHECK-NEXT:    adc.w r12, r0, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adds.w r2, r2, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
 ; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    smull r1, r3, r3, r1
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[6]
-; CHECK-NEXT:    adc.w r12, r0, r2
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[3]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r1, r3, r3, r1
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adc.w r1, r1, r12
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[4]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[5]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adds.w r2, r2, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r2, r3
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[7]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[8]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[8]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[8]
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[9]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[9]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r1, r3, r3, r1
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r1, r3
-; CHECK-NEXT:    vmov.u8 r3, q0[10]
-; CHECK-NEXT:    adc.w r12, r0, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w r12, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    adds.w r2, r2, lr
-; CHECK-NEXT:    adc.w r0, r0, r12
-; CHECK-NEXT:    adds.w lr, r2, r3
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[11]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w r12, r0, r2
+; CHECK-NEXT:    vmov.u8 r2, q1[12]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[12]
-; CHECK-NEXT:    adc.w r12, r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[13]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[13]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r1, r3, r3, r1
-; CHECK-NEXT:    smull r0, r2, r2, r0
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r3, r12, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r3, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r0, r0, r12
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[14]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[14]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
@@ -758,7 +798,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smlal r0, r1, r3, r2
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %yy = sext <16 x i8> %y to <16 x i64>
@@ -1316,183 +1356,211 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.u8 r2, q1[0]
-; CHECK-NEXT:    vmov.u8 r3, q1[1]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[1]
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[1]
+; CHECK-NEXT:    vmov.32 q3[2], r2
 ; CHECK-NEXT:    vmov.i64 q2, #0xff
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
+; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r12, s14
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov.u8 r4, q1[3]
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov.u8 r4, q0[2]
+; CHECK-NEXT:    umull r12, lr, r3, r2
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov.u8 r5, q0[3]
-; CHECK-NEXT:    umull r12, lr, r2, r12
 ; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.32 q4[0], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[3]
+; CHECK-NEXT:    vmov.32 q4[2], r4
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    umull r2, r3, r3, r2
 ; CHECK-NEXT:    orr.w lr, lr, r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q0[2]
-; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q1[3]
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    add r2, r12
 ; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    add r2, r12
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov r6, s18
 ; CHECK-NEXT:    umull r3, r4, r4, r3
-; CHECK-NEXT:    umull r5, r6, r6, r5
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r6, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    adds r2, r2, r5
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    umull r3, r4, r4, r3
+; CHECK-NEXT:    vmov.32 q5[2], r3
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, lr, r5
+; CHECK-NEXT:    vmov r5, s22
 ; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[4]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    vmov.32 q3[0], r5
 ; CHECK-NEXT:    vmov.u8 r5, q1[5]
-; CHECK-NEXT:    adcs r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q0[4]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov.32 q4[0], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[5]
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.32 q4[2], r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[2], r5
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[6]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[6]
+; CHECK-NEXT:    vmov.32 q3[0], r5
 ; CHECK-NEXT:    vmov.u8 r5, q1[7]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds.w r6, r6, r12
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    adc.w r12, r2, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[6]
+; CHECK-NEXT:    vmov.32 q4[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[7]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.32 q4[2], r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[2], r5
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[8]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[8]
+; CHECK-NEXT:    vmov.32 q3[0], r5
 ; CHECK-NEXT:    vmov.u8 r5, q1[9]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[8]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    adc.w r12, r2, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[8]
+; CHECK-NEXT:    vmov.32 q4[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.32 q4[2], r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[2], r5
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[10]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[10]
+; CHECK-NEXT:    vmov.32 q3[0], r5
 ; CHECK-NEXT:    vmov.u8 r5, q1[11]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[10]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    adc.w r12, r2, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[10]
+; CHECK-NEXT:    vmov.32 q4[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.32 q4[2], r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[2], r5
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    adds.w r12, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[12]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[12]
+; CHECK-NEXT:    vmov.32 q3[0], r5
 ; CHECK-NEXT:    vmov.u8 r5, q1[13]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[12]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    adc.w r12, r2, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[12]
+; CHECK-NEXT:    vmov.32 q4[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.32 q4[2], r4
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r2, r4, r2, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov.32 q5[1], r4
+; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    umull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q5[2], r5
+; CHECK-NEXT:    vmov.32 q5[3], r4
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds.w r2, r2, r12
+; CHECK-NEXT:    adcs r3, r5
+; CHECK-NEXT:    vmov r5, s22
+; CHECK-NEXT:    adds r2, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[14]
+; CHECK-NEXT:    vmov.32 q3[0], r5
 ; CHECK-NEXT:    vmov.u8 r5, q1[15]
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s14
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[14]
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r6
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov.u8 r5, q0[14]
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.u8 r4, q0[14]
+; CHECK-NEXT:    vand q1, q3, q2
+; CHECK-NEXT:    vmov.32 q3[0], r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    umlal r3, r2, r5, r6
-; CHECK-NEXT:    vmov r6, s6
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    umlal r3, r2, r5, r6
-; CHECK-NEXT:    adds r0, r0, r3
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov.32 q3[2], r4
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vand q0, q3, q2
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    umlal r2, r3, r4, r5
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    umlal r2, r3, r4, r5
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -1505,25 +1573,22 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov.u8 r2, q1[0]
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull lr, r12, r3, r2
-; CHECK-NEXT:    vmov.u8 r3, q1[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
-; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[1]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[1]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    vmov.u8 r4, q1[3]
-; CHECK-NEXT:    smull r2, r3, r2, r3
-; CHECK-NEXT:    vmov.u8 r5, q0[3]
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, lr
-; CHECK-NEXT:    smull r4, r5, r5, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r12
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    vmov lr, s10
 ; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    vmov r12, s9
@@ -1534,123 +1599,140 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r5, r3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[4]
-; CHECK-NEXT:    adc.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[4]
-; CHECK-NEXT:    vmov.u8 r2, q1[5]
-; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
 ; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r5, r4, r4, r5
 ; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r5, lr, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r5, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[6]
-; CHECK-NEXT:    adc.w r12, r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[6]
-; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.u8 r5, q0[7]
-; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    adds.w r4, r4, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w lr, r4, r2
+; CHECK-NEXT:    vmov.u8 r4, q1[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
 ; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r3, r4, r4, r3
-; CHECK-NEXT:    smull r2, r5, r5, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r5, r4
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[5]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r2, r4, r4, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r4
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    adc.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[8]
-; CHECK-NEXT:    vmov.u8 r2, q1[9]
-; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[6]
 ; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[7]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r2, r4, r4, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r5, lr, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r5, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    adc.w r12, r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[10]
-; CHECK-NEXT:    vmov.u8 r2, q1[11]
-; CHECK-NEXT:    vmov.u8 r5, q0[11]
-; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[8]
 ; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r3, r4, r4, r3
-; CHECK-NEXT:    smull r2, r5, r5, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r5, r4
+; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[9]
+; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[9]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r2, r4, r4, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r4
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    adds.w r3, r3, lr
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w lr, r3, r4
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    adc.w r12, r2, r5
-; CHECK-NEXT:    vmov.u8 r5, q1[12]
-; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[10]
 ; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r5, r4, r4, r5
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
-; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r2, r4, r4, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    adds.w r5, lr, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r5, r5, r4
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds.w lr, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    adc.w r12, r12, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[12]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    smull r2, r4, r2, r4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[13]
+; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r2, r4, r4, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r12, r12, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    adds r2, r2, r3
+; CHECK-NEXT:    adc.w r3, r12, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[14]
+; CHECK-NEXT:    sxtb.w r12, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[14]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smlal r5, r2, r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[15]
+; CHECK-NEXT:    smlal r2, r3, r4, r12
+; CHECK-NEXT:    vmov.u8 r4, q1[15]
+; CHECK-NEXT:    sxtb.w r12, r4
 ; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smlal r5, r2, r4, r3
-; CHECK-NEXT:    adds r0, r0, r5
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    smlal r2, r3, r4, r12
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %yy = sext <16 x i8> %y to <16 x i64>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 74f2b2c4f537..72462bb87f02 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -52,17 +52,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vmullb.u32 q3, q0, q1
-; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -86,17 +88,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vmullb.s32 q3, q0, q1
-; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -285,37 +289,44 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
 ; CHECK-LABEL: add_v2i16_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vand q4, q0, q3
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    umull r0, r1, r1, r0
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov r1, s18
 ; CHECK-NEXT:    vand q1, q2, q3
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -331,32 +342,36 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2
 ; CHECK-LABEL: add_v2i16_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q3, #0xffff
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vand q3, q2, q3
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s14
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    sxth r3, r3
 ; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    sxth r1, r1
 ; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r1
+; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -590,20 +605,22 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #40
-; CHECK-NEXT:    sub sp, #40
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    .pad #88
+; CHECK-NEXT:    sub sp, #88
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
-; CHECK-NEXT:    vpsel q5, q2, q0
-; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    vcmp.i8 eq, q2, zr
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vpsel q5, q1, q0
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.u8 r0, q5[0]
-; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i64 q4, #0xff
 ; CHECK-NEXT:    vmov.16 q2[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q5[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
@@ -619,149 +636,181 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q2[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q5[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vmov.u8 r3, q4[1]
-; CHECK-NEXT:    vpsel q6, q3, q0
+; CHECK-NEXT:    vpsel q6, q1, q0
 ; CHECK-NEXT:    vmov.u16 r0, q6[0]
-; CHECK-NEXT:    vmov.u16 r1, q6[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q6[1]
-; CHECK-NEXT:    vmov.u16 r1, q6[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.u8 r2, q1[0]
 ; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    ubfx r2, r0, #4, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
-; CHECK-NEXT:    vmov q7[3], q7[1], r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-NEXT:    vmov.32 q7[0], r1
+; CHECK-NEXT:    vmov.32 q7[1], r1
+; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q7[2], r1
+; CHECK-NEXT:    vmov.32 q7[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[0]
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[1]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vand q2, q0, q4
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q4[0]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    umull r1, r12, r2, r1
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    umull r2, r3, r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vand q1, q0, q4
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    umull r1, r2, r2, r1
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    umull r1, r2, r2, r1
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vand q0, q0, q7
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r12, s3
 ; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w lr, r2, r1
-; CHECK-NEXT:    vmov.u8 r1, q4[3]
-; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r3, r12
 ; CHECK-NEXT:    ubfx r3, r0, #8, #1
-; CHECK-NEXT:    ubfx r0, r0, #12, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    ubfx r0, r0, #12, #1
+; CHECK-NEXT:    vmov.32 q7[0], r3
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q7[2], q7[0], r0, r3
-; CHECK-NEXT:    vmov q7[3], q7[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.u8 r3, q1[3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
-; CHECK-NEXT:    vmov.u8 r3, q4[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r3
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov.32 q7[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q2[2]
+; CHECK-NEXT:    vmov.32 q7[2], r0
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q7[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[2]
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[3]
+; CHECK-NEXT:    vmov.u8 r3, q2[3]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    umull r2, r3, r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vand q0, q0, q7
-; CHECK-NEXT:    vmov q7, q4
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vand q0, q2, q7
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r2, r12, r0
+; CHECK-NEXT:    adcs r2, r0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q4[5]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q6[4]
-; CHECK-NEXT:    vmov.u16 r3, q6[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[5]
-; CHECK-NEXT:    vmov.u16 r3, q6[7]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[6]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[7]
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
+; CHECK-NEXT:    vmrs lr, p0
+; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
+; CHECK-NEXT:    and r3, lr, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q6[0], r3
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q6[2], r3
+; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q3[4]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q3[5]
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vmov.u8 r0, q3[4]
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[5]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov q7, q3
+; CHECK-NEXT:    vand q1, q1, q4
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    umull r0, r3, r0, r3
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vand q0, q2, q6
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    adds r3, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q3[6]
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    ubfx r0, lr, #8, #1
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.u8 r3, q1[5]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
-; CHECK-NEXT:    vmov.u8 r3, q4[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r3
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov.32 q6[0], r0
+; CHECK-NEXT:    vmov.32 q6[1], r0
+; CHECK-NEXT:    ubfx r0, lr, #12, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.32 q6[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[7]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    umull r3, r4, r4, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r1
-; CHECK-NEXT:    vand q0, q0, q6
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r3, lr, r0
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r3, r4
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r4, q4[7]
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmov.u8 r3, q1[7]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r3, q4[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vand q0, q0, q6
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q6
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r2, r2, r12
 ; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    adds.w r12, r2, r0
 ; CHECK-NEXT:    vmov.u8 r2, q5[8]
 ; CHECK-NEXT:    vmov.16 q6[0], r2
@@ -779,152 +828,182 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q6[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[15]
 ; CHECK-NEXT:    vmov.16 q6[7], r2
-; CHECK-NEXT:    adc.w lr, r1, r3
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vcmp.i16 ne, q6, zr
-; CHECK-NEXT:    vmov.u8 r0, q7[9]
-; CHECK-NEXT:    vpsel q3, q3, q0
+; CHECK-NEXT:    vmov.u8 r0, q7[8]
+; CHECK-NEXT:    vpsel q3, q1, q0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r2, q3[0]
-; CHECK-NEXT:    vmov.u16 r3, q3[2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.u8 r0, q7[9]
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[1]
-; CHECK-NEXT:    vmov.u16 r3, q3[3]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov.32 q1[2], r0
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
+; CHECK-NEXT:    vmrs lr, p0
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vand q1, q1, q5
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    and r3, lr, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
-; CHECK-NEXT:    vmov q4[3], q4[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[8]
-; CHECK-NEXT:    vmov.u8 r4, q1[9]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q7[8]
-; CHECK-NEXT:    vmov q5[2], q5[0], r0, r4
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q5, q5, q2
+; CHECK-NEXT:    vmov.32 q4[0], r3
+; CHECK-NEXT:    vmov.32 q4[1], r3
+; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q4[2], r3
+; CHECK-NEXT:    vmov.32 q4[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q6[8]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q6[9]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vand q0, q0, q5
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r1, s22
 ; CHECK-NEXT:    umull r0, r3, r0, r3
-; CHECK-NEXT:    umull r1, r4, r1, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vand q0, q2, q4
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r3, lr, r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    adds r3, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q7[10]
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q7[11]
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vand q1, q1, q5
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    ubfx r0, lr, #8, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    ubfx r0, lr, #12, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[10]
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vand q0, q0, q5
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r3, r4
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r4, q7[11]
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmov.u8 r3, q1[11]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r3, q7[10]
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q5, q5, q2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q4
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r2, r2, r12
 ; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    adds.w r12, r2, r0
 ; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    adc.w lr, r1, r3
-; CHECK-NEXT:    vmov.u16 r3, q3[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[5]
-; CHECK-NEXT:    vmov.u16 r3, q3[7]
-; CHECK-NEXT:    vmov.u8 r0, q7[13]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[7]
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
+; CHECK-NEXT:    vmov.u8 r0, q7[12]
+; CHECK-NEXT:    vmrs lr, p0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q7[13]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vand q1, q1, q5
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    and r3, lr, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r3, q1[12]
-; CHECK-NEXT:    vmov.u8 r4, q1[13]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q7[12]
-; CHECK-NEXT:    vmov q4[2], q4[0], r0, r4
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q6[12]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q6[13]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vand q0, q0, q5
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r1, s18
 ; CHECK-NEXT:    umull r0, r3, r0, r3
-; CHECK-NEXT:    umull r1, r4, r1, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vand q0, q0, q3
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vand q0, q2, q3
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r3, lr, r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    adds r3, r3, r2
+; CHECK-NEXT:    vmov.u8 r2, q7[14]
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q7[15]
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vand q1, q1, q5
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    adcs r1, r0
+; CHECK-NEXT:    ubfx r0, lr, #8, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    ubfx r0, lr, #12, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vand q0, q0, q5
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    adc.w r1, r3, r4
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r4, q7[15]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    vmov.u8 r3, q1[15]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r3, q7[14]
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, r3
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    umull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q2, q3
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds.w r2, r2, r12
 ; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #40
+; CHECK-NEXT:    add sp, #88
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -938,18 +1017,15 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    vmov.u8 r3, q1[1]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    vmov.u8 r0, q4[0]
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
 ; CHECK-NEXT:    vmov.16 q5[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q4[1]
 ; CHECK-NEXT:    vmov.16 q5[1], r0
@@ -965,128 +1041,147 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q5[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q4[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
 ; CHECK-NEXT:    vmov.u16 r0, q5[0]
-; CHECK-NEXT:    vmov.u16 r1, q5[2]
-; CHECK-NEXT:    vmov q6[2], q6[0], r1, r0
+; CHECK-NEXT:    vmov.32 q6[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[1]
-; CHECK-NEXT:    vmov.u16 r1, q5[3]
-; CHECK-NEXT:    vmov q6[3], q6[1], r1, r0
+; CHECK-NEXT:    vmov.32 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.32 q6[3], r0
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r0, p0
 ; CHECK-NEXT:    and r1, r0, #1
-; CHECK-NEXT:    ubfx r2, r0, #4, #1
 ; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
+; CHECK-NEXT:    vmov.32 q6[0], r1
+; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    ubfx r1, r0, #4, #1
+; CHECK-NEXT:    rsbs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q6[2], r1
+; CHECK-NEXT:    vmov.32 q6[3], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
 ; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r1, r12, r2, r1
+; CHECK-NEXT:    smull r1, r2, r2, r1
+; CHECK-NEXT:    vmov.32 q7[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
+; CHECK-NEXT:    vmov.32 q7[1], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r3, r2, r3
-; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
-; CHECK-NEXT:    vmov q7[3], q7[1], r3, r12
+; CHECK-NEXT:    smull r1, r2, r2, r1
+; CHECK-NEXT:    vmov.32 q7[2], r1
+; CHECK-NEXT:    vmov.32 q7[3], r2
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    vmov r1, s24
+; CHECK-NEXT:    vmov r1, s26
+; CHECK-NEXT:    vmov r2, s24
 ; CHECK-NEXT:    vmov r12, s27
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    adds.w lr, r1, r3
+; CHECK-NEXT:    vmov r3, s25
+; CHECK-NEXT:    adds r1, r1, r2
+; CHECK-NEXT:    adc.w r2, r3, r12
 ; CHECK-NEXT:    ubfx r3, r0, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r0, r0, #12, #1
-; CHECK-NEXT:    rsb.w r3, r3, #0
-; CHECK-NEXT:    rsb.w r0, r0, #0
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
-; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q6[0], r3
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[2]
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q6[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    smull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q7[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q7[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    smull r0, r3, r3, r0
-; CHECK-NEXT:    smull r1, r2, r2, r1
-; CHECK-NEXT:    vmov q7[2], q7[0], r1, r0
-; CHECK-NEXT:    vmov q7[3], q7[1], r2, r3
+; CHECK-NEXT:    vmov.32 q7[2], r0
+; CHECK-NEXT:    vmov.32 q7[3], r3
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r1, s24
+; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    vmov r0, s25
+; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    adds.w r1, r1, lr
-; CHECK-NEXT:    adc.w r2, r12, r0
+; CHECK-NEXT:    adcs r2, r0
 ; CHECK-NEXT:    vmov r0, s26
 ; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    adc.w r1, r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov.u16 r3, q5[6]
-; CHECK-NEXT:    sxtb r1, r1
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov.32 q6[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[5]
-; CHECK-NEXT:    vmov.u16 r3, q5[7]
-; CHECK-NEXT:    smull r1, r4, r4, r1
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[6]
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[7]
+; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r0, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r0, r3
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    vmov.32 q5[0], r3
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q5[2], r3
+; CHECK-NEXT:    vmov.32 q5[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q1[4]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r0, r3, r0, r3
+; CHECK-NEXT:    vmov.32 q6[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[5]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    smull r0, r3, r3, r0
-; CHECK-NEXT:    vmov q6[2], q6[0], r1, r0
-; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
+; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.32 q6[3], r3
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    vmov r1, s20
+; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r0, r0, lr
-; CHECK-NEXT:    adds r1, r1, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[7]
-; CHECK-NEXT:    adc.w r12, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r1, s22
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    adds r3, r3, r1
+; CHECK-NEXT:    adc.w r1, r12, r0
+; CHECK-NEXT:    ubfx r0, r2, #8, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    ubfx r0, r2, #12, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    smull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q6[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    smull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q6[2], q6[0], r0, r2
-; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
+; CHECK-NEXT:    smull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.32 q6[3], r2
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
+; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    adds.w r12, r2, r0
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -1103,133 +1198,155 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
-; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.u16 r3, q2[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.u16 r3, q2[3]
-; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r3
+; CHECK-NEXT:    vmov.32 q3[0], r3
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q3[2], r3
+; CHECK-NEXT:    vmov.32 q3[3], r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[8]
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r3, r4, r4, r3
-; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r4
+; CHECK-NEXT:    smull r0, r3, r0, r3
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.32 q4[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[9]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r3
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    vmov r1, s12
+; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r0, r0, lr
-; CHECK-NEXT:    adds r1, r1, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[11]
-; CHECK-NEXT:    adc.w r12, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmov.u8 r3, q0[10]
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    adds r3, r3, r1
+; CHECK-NEXT:    adc.w r1, r12, r0
+; CHECK-NEXT:    ubfx r0, r2, #8, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    ubfx r0, r2, #12, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    smull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.32 q4[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    smull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q4[2], q4[0], r0, r2
-; CHECK-NEXT:    vmov q4[3], q4[1], r4, r3
+; CHECK-NEXT:    smull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q4[3], r2
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
+; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    adds.w r12, r1, r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    adc.w lr, r2, r3
+; CHECK-NEXT:    adds.w r12, r2, r0
 ; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.u16 r3, q2[6]
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.u16 r3, q2[7]
-; CHECK-NEXT:    sxtb r0, r0
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
-; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
-; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    ubfx r4, r2, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[12]
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r3, r4, r4, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r4
+; CHECK-NEXT:    smull r0, r3, r0, r3
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[13]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r0, r3, r3, r0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r3
 ; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov r3, s11
-; CHECK-NEXT:    adds.w r1, r1, r12
-; CHECK-NEXT:    adc.w r0, r0, lr
-; CHECK-NEXT:    adds r1, r1, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[15]
-; CHECK-NEXT:    adc.w r12, r0, r3
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    vmov.u8 r3, q0[14]
+; CHECK-NEXT:    adds.w r3, r3, r12
+; CHECK-NEXT:    adc.w r12, r1, r0
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    adds r3, r3, r1
+; CHECK-NEXT:    adc.w r1, r12, r0
+; CHECK-NEXT:    ubfx r0, r2, #8, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    ubfx r0, r2, #12, #1
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    smull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, r3, r3, r2
-; CHECK-NEXT:    smull r0, r4, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    smull r0, r2, r2, r0
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    adds r1, r1, r2
-; CHECK-NEXT:    adc.w r2, r12, r0
+; CHECK-NEXT:    adcs r1, r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r2, r3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -1243,37 +1360,44 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
 ; CHECK-LABEL: add_v2i8_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vand q4, q0, q3
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    umull r0, r1, r1, r0
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov r1, s18
 ; CHECK-NEXT:    vand q1, q2, q3
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1289,32 +1413,36 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i
 ; CHECK-LABEL: add_v2i8_v2i64_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vand q3, q2, q3
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s14
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    cset r0, eq
-; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    cmp r1, #0
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    sxtb r0, r0
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    smull r0, r1, r1, r0
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.32 q3[3], r1
+; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r1, s3
@@ -1335,38 +1463,40 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
 ; CHECK-LABEL: add_v2i64_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    umull lr, r12, r1, r0
-; CHECK-NEXT:    umull r4, r5, r2, r3
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, lr
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    mla r1, r1, r4, r12
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    mla r0, r4, r0, r1
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    mla r1, r2, r1, r5
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    umull r12, r3, r1, r0
+; CHECK-NEXT:    mla r1, r1, r2, r3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.32 q3[0], r12
+; CHECK-NEXT:    mla r0, r2, r0, r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    umull r2, r3, r1, r0
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    mla r1, r1, r2, r3
 ; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    mla r1, r2, r3, r1
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    mla r0, r2, r0, r1
 ; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    cset r0, eq
+; CHECK-NEXT:    tst.w r0, #1
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r0, s11
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s11
 ; CHECK-NEXT:    cset r0, eq
 ; CHECK-NEXT:    tst.w r0, #1
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    cset r1, eq
-; CHECK-NEXT:    tst.w r1, #1
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -1374,7 +1504,7 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y
@@ -1439,17 +1569,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    vmullb.u32 q3, q0, q1
-; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -1478,17 +1610,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y,
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    vmullb.s32 q3, q0, q1
-; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vand q0, q3, q0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -1692,31 +1826,37 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vand q4, q0, q3
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    umull lr, r12, r3, r2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov r3, s18
 ; CHECK-NEXT:    vand q1, q2, q3
-; CHECK-NEXT:    umull r2, r3, r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -1725,6 +1865,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK-NEXT:    adc.w r3, lr, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -1743,32 +1884,36 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y,
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q3, #0xffff
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vand q3, q2, q3
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    sxth r3, r3
-; CHECK-NEXT:    smull lr, r12, r3, r2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxth r2, r2
-; CHECK-NEXT:    smull r2, r3, r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2021,16 +2166,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    .pad #80
+; CHECK-NEXT:    sub sp, #80
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
-; CHECK-NEXT:    vpsel q5, q2, q0
-; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    vcmp.i8 eq, q2, zr
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vpsel q5, q1, q0
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.u8 r2, q5[0]
-; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i64 q4, #0xff
 ; CHECK-NEXT:    vmov.16 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r2
@@ -2046,152 +2193,184 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q2[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r2
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vpsel q6, q3, q0
+; CHECK-NEXT:    vpsel q6, q1, q0
 ; CHECK-NEXT:    vmov.u16 r2, q6[0]
-; CHECK-NEXT:    vmov.u16 r3, q6[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[1]
-; CHECK-NEXT:    vmov.u16 r3, q6[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[2]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[3]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vmrs lr, p0
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
 ; CHECK-NEXT:    and r3, lr, #1
-; CHECK-NEXT:    ubfx r2, lr, #4, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q7[2], q7[0], r2, r3
-; CHECK-NEXT:    vmov q7[3], q7[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[0]
+; CHECK-NEXT:    vmov.32 q7[0], r3
+; CHECK-NEXT:    vmov.32 q7[1], r3
+; CHECK-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q7[2], r3
+; CHECK-NEXT:    vmov.32 q7[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q3[0]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q3[1]
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.u8 r3, q1[0]
+; CHECK-NEXT:    vand q2, q0, q4
+; CHECK-NEXT:    vmov.32 q0[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[1]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov.u8 r3, q4[0]
-; CHECK-NEXT:    vmov.u8 r2, q4[1]
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    umull r2, r12, r2, r12
-; CHECK-NEXT:    umull r3, r4, r4, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r12
+; CHECK-NEXT:    vmov r12, s8
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vand q1, q0, q4
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull r3, r2, r3, r12
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q0[3], r3
 ; CHECK-NEXT:    vand q0, q0, q7
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r12, s3
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    adds r6, r2, r3
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    adds r4, r4, r2
 ; CHECK-NEXT:    ubfx r2, lr, #8, #1
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r3, q4[3]
-; CHECK-NEXT:    adc.w r12, r12, r4
-; CHECK-NEXT:    ubfx r4, lr, #12, #1
-; CHECK-NEXT:    rsbs r4, r4, #0
-; CHECK-NEXT:    vmov q7[2], q7[0], r4, r2
-; CHECK-NEXT:    vmov q7[3], q7[1], r4, r2
-; CHECK-NEXT:    vmov.u8 r2, q1[2]
-; CHECK-NEXT:    vmov.u8 r4, q1[3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
-; CHECK-NEXT:    vmov.u8 r4, q4[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r4
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov.32 q7[0], r2
+; CHECK-NEXT:    adc.w r12, r12, r3
+; CHECK-NEXT:    vmov.32 q7[1], r2
+; CHECK-NEXT:    ubfx r2, lr, #12, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r3, q2[2]
+; CHECK-NEXT:    vmov.32 q7[2], r2
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q7[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[2]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q3[3]
+; CHECK-NEXT:    vmov.u8 r3, q2[3]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r5, s14
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull r2, r3, r3, r2
-; CHECK-NEXT:    umull r5, r4, r5, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vmov.u8 r4, q4[5]
-; CHECK-NEXT:    vand q0, q0, q7
-; CHECK-NEXT:    vmov q7, q4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vand q0, q2, q7
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s3
-; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    adc.w lr, r12, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    adds.w r12, r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[4]
-; CHECK-NEXT:    vmov.u16 r6, q6[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q6[5]
-; CHECK-NEXT:    vmov.u16 r6, q6[7]
-; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[6]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q6[7]
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    adc.w lr, lr, r4
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r6, r2, #1
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r5, r6
-; CHECK-NEXT:    vmov q6[3], q6[1], r5, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[4]
-; CHECK-NEXT:    vmov.u8 r5, q1[5]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q4[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r4, r5
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q3, q3, q2
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    vmrs r6, p0
+; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
+; CHECK-NEXT:    and r4, r6, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov.32 q6[0], r4
+; CHECK-NEXT:    vmov.32 q6[1], r4
+; CHECK-NEXT:    ubfx r4, r6, #4, #1
+; CHECK-NEXT:    rsbs r4, r4, #0
+; CHECK-NEXT:    vmov.32 q6[2], r4
+; CHECK-NEXT:    vmov.32 q6[3], r4
+; CHECK-NEXT:    vmov.u8 r4, q3[4]
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov.u8 r4, q3[5]
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 q0[2], r4
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vmov.u8 r3, q3[4]
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q3[5]
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vmov q7, q3
+; CHECK-NEXT:    vand q1, q1, q4
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull r3, r4, r3, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
-; CHECK-NEXT:    vand q0, q0, q6
+; CHECK-NEXT:    vmov.32 q2[0], r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov.32 q2[1], r4
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    umull r3, r4, r4, r3
+; CHECK-NEXT:    vmov.32 q2[2], r3
+; CHECK-NEXT:    vmov.32 q2[3], r4
+; CHECK-NEXT:    vand q0, q2, q6
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w r6, r12, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r5, lr, r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    ubfx r6, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r6, r6, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r12, r5, r4
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r6
-; CHECK-NEXT:    vmov.u8 r5, q4[7]
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r6
+; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r3, r3, lr
+; CHECK-NEXT:    adds r4, r4, r5
+; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    ubfx r2, r6, #8, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.u8 r3, q3[6]
+; CHECK-NEXT:    vmov.32 q6[0], r2
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    ubfx r2, r6, #12, #1
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.32 q6[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmov.u8 r6, q1[7]
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
-; CHECK-NEXT:    vmov.u8 r6, q4[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q3[7]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vand q1, q1, q4
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    umull r2, r6, r6, r2
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
-; CHECK-NEXT:    vmov.u8 r4, q7[9]
-; CHECK-NEXT:    vand q0, q0, q6
-; CHECK-NEXT:    vmov r6, s0
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    vand q0, q2, q6
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
 ; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    adds r3, r3, r4
 ; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    vmov.u8 r5, q7[8]
+; CHECK-NEXT:    adc.w r3, r2, r6
 ; CHECK-NEXT:    vmov.u8 r2, q5[8]
 ; CHECK-NEXT:    vmov.16 q6[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[9]
@@ -2209,69 +2388,84 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.u8 r2, q5[15]
 ; CHECK-NEXT:    vmov.16 q6[7], r2
 ; CHECK-NEXT:    vcmp.i16 ne, q6, zr
-; CHECK-NEXT:    vpsel q3, q3, q0
+; CHECK-NEXT:    vpsel q3, q1, q0
+; CHECK-NEXT:    vmov.32 q1[0], r5
 ; CHECK-NEXT:    vmov.u16 r2, q3[0]
-; CHECK-NEXT:    vmov.u16 r6, q3[2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    vmov.u8 r5, q7[9]
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[1]
-; CHECK-NEXT:    vmov.u16 r6, q3[3]
-; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov.32 q1[2], r5
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vand q1, q1, q5
+; CHECK-NEXT:    vmov r5, s4
 ; CHECK-NEXT:    and r6, r2, #1
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
 ; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov q4[2], q4[0], r5, r6
-; CHECK-NEXT:    vmov q4[3], q4[1], r5, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[8]
-; CHECK-NEXT:    vmov.u8 r5, q1[9]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q7[8]
-; CHECK-NEXT:    vmov q5[2], q5[0], r4, r5
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q5, q5, q2
+; CHECK-NEXT:    vmov.32 q4[0], r6
+; CHECK-NEXT:    vmov.32 q4[1], r6
+; CHECK-NEXT:    ubfx r6, r2, #4, #1
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    vmov.32 q4[2], r6
+; CHECK-NEXT:    vmov.32 q4[3], r6
+; CHECK-NEXT:    vmov.u8 r6, q6[8]
+; CHECK-NEXT:    vmov.32 q0[0], r6
+; CHECK-NEXT:    vmov.u8 r6, q6[9]
+; CHECK-NEXT:    vmov.32 q0[2], r6
+; CHECK-NEXT:    vand q0, q0, q5
 ; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r5, s20
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r3, s22
 ; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r3, r4, r3, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
-; CHECK-NEXT:    vand q0, q0, q4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w r6, r12, r4
+; CHECK-NEXT:    vmov.32 q2[0], r6
+; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    vmov.32 q2[2], r6
+; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    vand q0, q2, q4
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov r6, s1
 ; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r5, lr, r3
+; CHECK-NEXT:    adds.w r5, r5, r12
+; CHECK-NEXT:    adcs r6, r3
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adc.w r12, r6, r4
 ; CHECK-NEXT:    ubfx r6, r2, #8, #1
+; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r6, r6, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r12, r5, r4
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r6
-; CHECK-NEXT:    vmov.u8 r5, q7[11]
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r6
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmov.u8 r6, q1[11]
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    vmov.32 q4[0], r6
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q4[1], r6
 ; CHECK-NEXT:    vmov.u8 r6, q7[10]
-; CHECK-NEXT:    vmov q5[2], q5[0], r5, r6
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q5, q5, q2
+; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.32 q1[0], r6
+; CHECK-NEXT:    vmov.32 q4[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q6[10]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q6[11]
+; CHECK-NEXT:    vmov.u8 r6, q7[11]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q1[2], r6
+; CHECK-NEXT:    vand q0, q0, q5
+; CHECK-NEXT:    vand q1, q1, q5
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r6, s20
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov r6, s4
 ; CHECK-NEXT:    umull r2, r6, r6, r2
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
-; CHECK-NEXT:    vmov.u8 r4, q7[13]
-; CHECK-NEXT:    vand q0, q0, q4
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q2[1], r6
+; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    umull r2, r6, r6, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r6
+; CHECK-NEXT:    vand q0, q2, q4
 ; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s2
@@ -2279,68 +2473,83 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov r6, s3
 ; CHECK-NEXT:    adc.w r2, r2, r12
 ; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    vmov.u8 r5, q7[12]
+; CHECK-NEXT:    vmov.32 q1[0], r5
+; CHECK-NEXT:    vmov.u8 r5, q7[13]
+; CHECK-NEXT:    vmov.32 q1[2], r5
+; CHECK-NEXT:    vand q1, q1, q5
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    adc.w r3, r2, r6
 ; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vmov.u16 r6, q3[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q3[5]
-; CHECK-NEXT:    vmov.u16 r6, q3[7]
-; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[7]
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    and r6, r2, #1
-; CHECK-NEXT:    ubfx r5, r2, #4, #1
 ; CHECK-NEXT:    rsbs r6, r6, #0
-; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r5, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[12]
-; CHECK-NEXT:    vmov.u8 r5, q1[13]
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
-; CHECK-NEXT:    vmov.u8 r5, q7[12]
-; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov.32 q3[0], r6
+; CHECK-NEXT:    vmov.32 q3[1], r6
+; CHECK-NEXT:    ubfx r6, r2, #4, #1
+; CHECK-NEXT:    rsbs r6, r6, #0
+; CHECK-NEXT:    vmov.32 q3[2], r6
+; CHECK-NEXT:    vmov.32 q3[3], r6
+; CHECK-NEXT:    vmov.u8 r6, q6[12]
+; CHECK-NEXT:    vmov.32 q0[0], r6
+; CHECK-NEXT:    vmov.u8 r6, q6[13]
+; CHECK-NEXT:    vmov.32 q0[2], r6
+; CHECK-NEXT:    vand q0, q0, q5
 ; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmov r3, s18
 ; CHECK-NEXT:    umull r6, r5, r5, r6
-; CHECK-NEXT:    umull r3, r4, r3, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r6
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
-; CHECK-NEXT:    vand q0, q0, q3
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds.w r6, r12, r4
+; CHECK-NEXT:    vmov.32 q2[0], r6
+; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    umull r6, r5, r5, r6
+; CHECK-NEXT:    vmov.32 q2[2], r6
+; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    vand q0, q2, q3
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov r6, s1
 ; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adc.w r5, lr, r3
+; CHECK-NEXT:    adds.w r5, r5, r12
+; CHECK-NEXT:    adcs r6, r3
 ; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    adds r3, r3, r6
+; CHECK-NEXT:    adds r3, r3, r5
+; CHECK-NEXT:    adc.w r12, r6, r4
 ; CHECK-NEXT:    ubfx r6, r2, #8, #1
+; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsb.w r6, r6, #0
-; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w r12, r5, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r6
-; CHECK-NEXT:    vmov.u8 r5, q7[15]
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r6
-; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    vmov.u8 r6, q1[15]
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
+; CHECK-NEXT:    vmov.32 q3[0], r6
+; CHECK-NEXT:    rsbs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q3[1], r6
 ; CHECK-NEXT:    vmov.u8 r6, q7[14]
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r6
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q1[0], r6
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q6[14]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q6[15]
+; CHECK-NEXT:    vmov.u8 r6, q7[15]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q1[2], r6
+; CHECK-NEXT:    vand q0, q0, q5
+; CHECK-NEXT:    vand q1, q1, q5
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    vmov r4, s6
 ; CHECK-NEXT:    umull r2, r6, r6, r2
-; CHECK-NEXT:    umull r5, r4, r4, r5
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.32 q2[1], r6
+; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    umull r2, r6, r6, r2
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r6
+; CHECK-NEXT:    vand q0, q2, q3
 ; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r5, s2
@@ -2351,7 +2560,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    adcs r2, r6
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    add sp, #80
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
@@ -2368,18 +2577,15 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    vmov.u8 r4, q0[1]
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov.u8 r2, q4[0]
-; CHECK-NEXT:    vmov.u8 r5, q0[3]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[1]
 ; CHECK-NEXT:    vmov.16 q5[1], r2
@@ -2395,132 +2601,151 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    sxtb r5, r5
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q5, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q5[0]
-; CHECK-NEXT:    vmov.u16 r3, q5[2]
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov.32 q6[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[1]
-; CHECK-NEXT:    vmov.u16 r3, q5[3]
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[2]
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[3]
+; CHECK-NEXT:    vmov.32 q6[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vmrs r12, p0
-; CHECK-NEXT:    and r3, r12, #1
-; CHECK-NEXT:    ubfx r2, r12, #4, #1
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
-; CHECK-NEXT:    vmov.u8 r2, q1[0]
+; CHECK-NEXT:    vmov.32 q6[0], r3
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    vmov.32 q6[2], r3
+; CHECK-NEXT:    vmov.32 q6[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q1[0]
+; CHECK-NEXT:    sxtb.w r12, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
-; CHECK-NEXT:    sxtb r2, r2
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r2, lr, r3, r2
+; CHECK-NEXT:    smull r3, r12, r3, r12
+; CHECK-NEXT:    vmov.32 q7[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q1[1]
+; CHECK-NEXT:    vmov.32 q7[1], r12
+; CHECK-NEXT:    sxtb.w r12, r3
+; CHECK-NEXT:    vmov.u8 r3, q0[1]
 ; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r3, r4, r4, r3
-; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
-; CHECK-NEXT:    vmov q7[3], q7[1], r4, lr
+; CHECK-NEXT:    smull r3, r12, r3, r12
+; CHECK-NEXT:    vmov.32 q7[2], r3
+; CHECK-NEXT:    vmov.32 q7[3], r12
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    vmov r4, s26
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov lr, s27
-; CHECK-NEXT:    vmov r3, s25
-; CHECK-NEXT:    adds r6, r2, r4
-; CHECK-NEXT:    ubfx r4, r12, #8, #1
-; CHECK-NEXT:    ubfx r2, r12, #12, #1
-; CHECK-NEXT:    rsb.w r4, r4, #0
+; CHECK-NEXT:    vmov r3, s26
+; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov r12, s27
+; CHECK-NEXT:    vmov r5, s25
+; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsb.w r3, r3, #0
+; CHECK-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-NEXT:    vmov.32 q6[0], r3
 ; CHECK-NEXT:    rsb.w r2, r2, #0
-; CHECK-NEXT:    adc.w lr, lr, r3
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r4
-; CHECK-NEXT:    vmov.u8 r3, q1[3]
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r4
-; CHECK-NEXT:    vmov.u8 r2, q1[2]
-; CHECK-NEXT:    vmov.u8 r4, q0[2]
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
+; CHECK-NEXT:    vmov.32 q6[2], r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[2]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    smull r2, r4, r4, r2
-; CHECK-NEXT:    smull r3, r5, r5, r3
-; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
-; CHECK-NEXT:    vmov q7[3], q7[1], r5, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[5]
+; CHECK-NEXT:    adc.w r12, r12, r5
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q7[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    vmov.32 q7[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q7[2], r2
+; CHECK-NEXT:    vmov.32 q7[3], r3
 ; CHECK-NEXT:    vand q6, q7, q6
-; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov r3, s24
 ; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    vmov r5, s26
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s27
-; CHECK-NEXT:    adc.w r2, r2, lr
-; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    vmov.u8 r3, q0[5]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r3, r4, r3, r4
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    vmov r4, s26
+; CHECK-NEXT:    vmov r5, s27
+; CHECK-NEXT:    adds.w r3, r3, lr
+; CHECK-NEXT:    adc.w r2, r2, r12
+; CHECK-NEXT:    adds.w r12, r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    adc.w r3, r2, r5
 ; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov.u16 r6, q5[6]
-; CHECK-NEXT:    vmov q6[2], q6[0], r6, r2
+; CHECK-NEXT:    vmov.32 q6[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[5]
-; CHECK-NEXT:    vmov.u16 r6, q5[7]
-; CHECK-NEXT:    vmov q6[3], q6[1], r6, r2
+; CHECK-NEXT:    vmov.32 q6[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[6]
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[7]
+; CHECK-NEXT:    vmov.32 q6[3], r2
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vcmp.i32 ne, q6, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r6, r2, #1
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q5[0], r5
+; CHECK-NEXT:    vmov.32 q5[1], r5
 ; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r5, r6
-; CHECK-NEXT:    vmov q5[3], q5[1], r5, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[4]
-; CHECK-NEXT:    vmov.u8 r5, q0[4]
-; CHECK-NEXT:    sxtb r6, r6
+; CHECK-NEXT:    vmov.32 q5[2], r5
+; CHECK-NEXT:    vmov.32 q5[3], r5
+; CHECK-NEXT:    vmov.u8 r5, q1[4]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q6[0], r5
+; CHECK-NEXT:    vmov.u8 r5, q1[5]
+; CHECK-NEXT:    vmov.32 q6[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[5]
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r6, r5, r5, r6
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r6
-; CHECK-NEXT:    vmov q6[3], q6[1], r4, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q6[2], r5
+; CHECK-NEXT:    vmov.32 q6[3], r4
 ; CHECK-NEXT:    vand q5, q6, q5
 ; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    vmov r5, s23
-; CHECK-NEXT:    adds.w r6, r12, r4
+; CHECK-NEXT:    vmov r5, s21
+; CHECK-NEXT:    adds.w r12, r12, r4
 ; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    adc.w r3, r3, lr
-; CHECK-NEXT:    adds r6, r6, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[7]
-; CHECK-NEXT:    adc.w r12, r3, r5
-; CHECK-NEXT:    ubfx r5, r2, #8, #1
+; CHECK-NEXT:    adcs r5, r3
+; CHECK-NEXT:    vmov r3, s23
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r5, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q5[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r3, q0[7]
-; CHECK-NEXT:    vmov q5[2], q5[0], r2, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r5
-; CHECK-NEXT:    vmov.u8 r2, q1[6]
-; CHECK-NEXT:    vmov.u8 r5, q0[6]
+; CHECK-NEXT:    vmov.32 q5[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    vmov.32 q5[2], r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q5[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[6]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r2, r5, r5, r2
-; CHECK-NEXT:    smull r3, r4, r3, r4
-; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
-; CHECK-NEXT:    vmov q6[3], q6[1], r4, r5
-; CHECK-NEXT:    vmov.u8 r4, q1[9]
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q6[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[7]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q6[2], r2
+; CHECK-NEXT:    vmov.32 q6[3], r3
 ; CHECK-NEXT:    vand q5, q6, q5
-; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s23
+; CHECK-NEXT:    vmov r5, s23
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s22
 ; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    vmov.u8 r3, q0[9]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r3, r4, r3, r4
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    adds.w r12, r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[8]
+; CHECK-NEXT:    adc.w r3, r2, r5
 ; CHECK-NEXT:    vmov.u8 r2, q4[8]
 ; CHECK-NEXT:    vmov.16 q5[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[9]
@@ -2537,131 +2762,155 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q5[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q4[15]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.u16 r6, q2[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[1]
-; CHECK-NEXT:    vmov.u16 r6, q2[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r6, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r6, r2, #1
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q3[0], r5
+; CHECK-NEXT:    vmov.32 q3[1], r5
 ; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r5, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[8]
-; CHECK-NEXT:    vmov.u8 r5, q0[8]
-; CHECK-NEXT:    sxtb r6, r6
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.32 q3[3], r5
+; CHECK-NEXT:    vmov.u8 r5, q1[8]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q4[0], r5
+; CHECK-NEXT:    vmov.u8 r5, q1[9]
+; CHECK-NEXT:    vmov.32 q4[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[9]
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r6, r5, r5, r6
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r6
-; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q4[2], r5
+; CHECK-NEXT:    vmov.32 q4[3], r4
 ; CHECK-NEXT:    vand q3, q4, q3
 ; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r3, s13
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    adds.w r6, r12, r4
+; CHECK-NEXT:    vmov r5, s13
+; CHECK-NEXT:    adds.w r12, r12, r4
 ; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    adc.w r3, r3, lr
-; CHECK-NEXT:    adds r6, r6, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[11]
-; CHECK-NEXT:    adc.w r12, r3, r5
-; CHECK-NEXT:    ubfx r5, r2, #8, #1
+; CHECK-NEXT:    adcs r5, r3
+; CHECK-NEXT:    vmov r3, s15
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r5, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q3[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r3, q0[11]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r5
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmov.u8 r5, q0[10]
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[10]
+; CHECK-NEXT:    vmov.32 q3[2], r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[10]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r2, r5, r5, r2
-; CHECK-NEXT:    smull r3, r4, r3, r4
-; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
-; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
-; CHECK-NEXT:    vmov.u8 r4, q1[13]
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q4[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.32 q4[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[11]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q4[2], r2
+; CHECK-NEXT:    vmov.32 q4[3], r3
 ; CHECK-NEXT:    vand q3, q4, q3
-; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s15
+; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s14
 ; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    vmov.u8 r3, q0[13]
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull r3, r4, r3, r4
-; CHECK-NEXT:    adc.w lr, r2, r6
+; CHECK-NEXT:    adds.w r12, r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[12]
+; CHECK-NEXT:    adc.w r3, r2, r5
 ; CHECK-NEXT:    vmov.u16 r2, q2[4]
-; CHECK-NEXT:    vmov.u16 r6, q2[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.u16 r6, q2[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r6, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    sxtb r4, r4
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    and r6, r2, #1
+; CHECK-NEXT:    and r5, r2, #1
+; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q2[0], r5
+; CHECK-NEXT:    vmov.32 q2[1], r5
 ; CHECK-NEXT:    ubfx r5, r2, #4, #1
-; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    rsbs r5, r5, #0
-; CHECK-NEXT:    vmov q2[2], q2[0], r5, r6
-; CHECK-NEXT:    vmov q2[3], q2[1], r5, r6
-; CHECK-NEXT:    vmov.u8 r6, q1[12]
-; CHECK-NEXT:    vmov.u8 r5, q0[12]
-; CHECK-NEXT:    sxtb r6, r6
+; CHECK-NEXT:    vmov.32 q2[2], r5
+; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    vmov.u8 r5, q1[12]
 ; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r6, r5, r5, r6
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r6
-; CHECK-NEXT:    vmov q3[3], q3[1], r4, r5
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q3[0], r5
+; CHECK-NEXT:    vmov.u8 r5, q1[13]
+; CHECK-NEXT:    vmov.32 q3[1], r4
+; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    smull r5, r4, r4, r5
+; CHECK-NEXT:    vmov.32 q3[2], r5
+; CHECK-NEXT:    vmov.32 q3[3], r4
 ; CHECK-NEXT:    vand q2, q3, q2
 ; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    adds.w r6, r12, r4
+; CHECK-NEXT:    vmov r5, s9
+; CHECK-NEXT:    adds.w r12, r12, r4
 ; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    adc.w r3, r3, lr
-; CHECK-NEXT:    adds r6, r6, r4
-; CHECK-NEXT:    vmov.u8 r4, q1[15]
-; CHECK-NEXT:    adc.w r12, r3, r5
-; CHECK-NEXT:    ubfx r5, r2, #8, #1
+; CHECK-NEXT:    adcs r5, r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    adds.w r4, r4, r12
+; CHECK-NEXT:    adc.w r12, r5, r3
+; CHECK-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r5, r5, #0
+; CHECK-NEXT:    vmov.32 q2[0], r3
 ; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vmov.u8 r3, q0[15]
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r5
-; CHECK-NEXT:    sxtb r4, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r5
-; CHECK-NEXT:    vmov.u8 r2, q1[14]
-; CHECK-NEXT:    vmov.u8 r5, q0[14]
+; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[14]
+; CHECK-NEXT:    vmov.32 q2[2], r2
 ; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[14]
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r5, r5
-; CHECK-NEXT:    smull r2, r5, r5, r2
-; CHECK-NEXT:    smull r3, r4, r3, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[15]
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[15]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r5, s2
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    adc.w r2, r2, r12
-; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    adcs r2, r6
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    adcs r2, r5
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -2678,31 +2927,37 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i64 q3, #0xff
 ; CHECK-NEXT:    vand q1, q1, q3
-; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vand q4, q0, q3
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    umull lr, r12, r3, r2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov r3, s18
 ; CHECK-NEXT:    vand q1, q2, q3
-; CHECK-NEXT:    umull r2, r3, r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    umull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
+; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2711,6 +2966,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK-NEXT:    adc.w r3, lr, r12
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -2729,32 +2985,36 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q3, #0xff
-; CHECK-NEXT:    vand q2, q2, q3
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vand q3, q2, q3
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    sxtb r3, r3
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    sxtb r3, r3
-; CHECK-NEXT:    smull lr, r12, r3, r2
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.32 q3[1], r3
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    sxtb r2, r2
-; CHECK-NEXT:    smull r2, r3, r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
-; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.32 q3[3], r3
+; CHECK-NEXT:    vand q0, q3, q2
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
@@ -2778,48 +3038,52 @@ entry:
 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
 ; CHECK-LABEL: add_v2i64_v2i64_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    vmov r4, s5
 ; CHECK-NEXT:    umull r12, lr, r3, r2
-; CHECK-NEXT:    umull r6, r7, r5, r4
-; CHECK-NEXT:    vmov q3[2], q3[0], r6, r12
-; CHECK-NEXT:    vmov r6, s5
-; CHECK-NEXT:    mla r3, r3, r6, lr
-; CHECK-NEXT:    vmov r6, s1
-; CHECK-NEXT:    mla r2, r6, r2, r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    mla r3, r5, r3, r7
-; CHECK-NEXT:    vmov r7, s3
-; CHECK-NEXT:    mla r3, r7, r4, r3
-; CHECK-NEXT:    vmov r7, s10
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
-; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    mla r3, r3, r4, lr
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov.32 q3[0], r12
+; CHECK-NEXT:    mla r2, r4, r2, r3
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    umull r4, r12, r3, r2
+; CHECK-NEXT:    vmov.32 q3[2], r4
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    mla r3, r3, r4, r12
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    mla r2, r4, r2, r3
 ; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    cset r2, eq
+; CHECK-NEXT:    tst.w r2, #1
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    orrs r2, r3
-; CHECK-NEXT:    vmov r3, s11
 ; CHECK-NEXT:    cset r2, eq
 ; CHECK-NEXT:    tst.w r2, #1
 ; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    orrs r3, r7
-; CHECK-NEXT:    cset r3, eq
-; CHECK-NEXT:    tst.w r3, #1
-; CHECK-NEXT:    csetm r3, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vand q0, q3, q0
-; CHECK-NEXT:    vmov r7, s2
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov r12, s3
 ; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    adds r7, r7, r6
-; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    adds r0, r0, r7
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    adds r2, r2, r4
+; CHECK-NEXT:    adc.w r3, r3, r12
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
index 29136b0429d0..0a5f21058687 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
@@ -76,20 +76,22 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov r12, s7
+; CHECK-NEXT:    vmov lr, s3
 ; CHECK-NEXT:    adds r6, r3, r2
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adc.w r12, r12, lr
 ; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r6
+; CHECK-NEXT:    vmov.32 q0[0], r5
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], r6
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index 26aa66b03ca2..7c8c0bae6bec 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -269,20 +269,22 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s3, s9
 ; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r12, s5
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r12, s7
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -297,58 +299,62 @@ entry:
 define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vld2_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
-; CHECK-NEXT:    vmov.f64 d4, d1
-; CHECK-NEXT:    vmov.f32 s9, s3
-; CHECK-NEXT:    vmov.f32 s10, s22
-; CHECK-NEXT:    vmov.f32 s2, s20
-; CHECK-NEXT:    vmov.f32 s11, s23
-; CHECK-NEXT:    vmov.f32 s3, s21
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r12, s9
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov.f64 d6, d3
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmov.f32 s6, s16
-; CHECK-NEXT:    vmov.f32 s7, s17
-; CHECK-NEXT:    vmov.f32 s15, s19
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d8, d7
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vmov.f32 s14, s20
+; CHECK-NEXT:    vmov.f32 s15, s21
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r4, s12
+; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov r12, s19
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov.f32 s3, s9
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    adcs r0, r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s2
-; CHECK-NEXT:    vmov q3[2], q3[0], r5, r3
-; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
-; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov.32 q3[1], r2
 ; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.32 q3[2], lr
+; CHECK-NEXT:    vmov.32 q3[3], r12
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
-; CHECK-NEXT:    adds r4, r4, r6
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, lr
-; CHECK-NEXT:    adcs r0, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r12
-; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    adds.w lr, r4, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    adc.w r12, r2, r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %l1 = load <8 x i64>, <8 x i64>* %src, align 4
   %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index 46a02a692399..ad7f80560195 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -293,24 +293,30 @@ define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
 ; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmov.u16 r3, q0[6]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov.u16 r3, q0[7]
 ; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r2
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
 ; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.32 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r2
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vadd.i32 q0, q1, q2
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
@@ -328,78 +334,82 @@ entry:
 define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld3_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
 ; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
 ; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
 ; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.f32 s22, s8
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.f32 s22, s12
 ; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov q3, q5
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmovnb.i32 q3, q4
+; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vmov.f32 s18, s11
 ; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov q3[2], q3[0], r0, r2
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmovnb.i32 q6, q4
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
 ; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
 ; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
 ; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
 ; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.f32 s19, s23
 ; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
 ; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
 ; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovnb.i32 q1, q5
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r2
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
-; CHECK-NEXT:    vadd.i16 q0, q4, q1
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
+; CHECK-NEXT:    vmovnb.i32 q2, q5
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vadd.i16 q1, q4, q3
+; CHECK-NEXT:    vadd.i16 q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <24 x i16>, <24 x i16>* %src, align 4
@@ -417,143 +427,151 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
 ; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
 ; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
 ; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.f32 s18, s11
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[4]
 ; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.u16 r2, q3[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s8
-; CHECK-NEXT:    vmov q3, q5
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmovnb.i32 q3, q4
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmov.f32 s22, s12
+; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vmovnb.i32 q6, q4
+; CHECK-NEXT:    vmov r2, s26
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov r2, s23
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
 ; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
 ; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
 ; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.u16 r2, q2[4]
 ; CHECK-NEXT:    vmov.16 q4[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
 ; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.u16 r2, q3[5]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
 ; CHECK-NEXT:    vmov.16 q4[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
 ; CHECK-NEXT:    vmov.f32 s19, s23
 ; CHECK-NEXT:    vmov.16 q5[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
 ; CHECK-NEXT:    vmov.16 q5[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
 ; CHECK-NEXT:    vmov.16 q5[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
 ; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
 ; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.16 q0[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmovnb.i32 q1, q5
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vadd.i16 q0, q4, q1
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[1]
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
-; CHECK-NEXT:    vmov.16 q7[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[7]
-; CHECK-NEXT:    vmov.16 q7[7], r0
-; CHECK-NEXT:    vmov.f32 s26, s15
-; CHECK-NEXT:    vmov.f32 s30, s16
+; CHECK-NEXT:    vmovnb.i32 q2, q5
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov.32 q3[3], r2
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vadd.i16 q1, q4, q3
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s7, s23
-; CHECK-NEXT:    vmov q5, q7
-; CHECK-NEXT:    vmovnb.i32 q5, q6
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov q5[2], q5[0], r0, r2
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    vmov r2, s31
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r0
-; CHECK-NEXT:    vmov.u16 r0, q4[0]
-; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
 ; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.f32 s26, s12
+; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vmov q7, q6
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmovnb.i32 q7, q5
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov r0, s30
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov r0, s27
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q5[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q5[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q5[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q7[0], r0
+; CHECK-NEXT:    vmov.16 q5[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q7[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q7[2], r0
+; CHECK-NEXT:    vmov.16 q5[4], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.16 q7[3], r0
+; CHECK-NEXT:    vmov.16 q6[6], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q7[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[6]
 ; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov q2, q6
-; CHECK-NEXT:    vmovnb.i32 q2, q7
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
-; CHECK-NEXT:    vmov r0, s29
-; CHECK-NEXT:    vmov r2, s27
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
-; CHECK-NEXT:    vadd.i16 q1, q1, q2
-; CHECK-NEXT:    vadd.i16 q1, q1, q5
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.16 q6[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov r0, s25
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmovnb.i32 q3, q6
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vadd.i16 q1, q5, q1
+; CHECK-NEXT:    vadd.i16 q1, q1, q4
 ; CHECK-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
@@ -607,38 +625,42 @@ entry:
 define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-LABEL: vld3_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vldrb.u16 q0, [r0]
-; CHECK-NEXT:    ldr r0, [r0, #8]
-; CHECK-NEXT:    str r0, [sp]
-; CHECK-NEXT:    vmov.u16 r3, q0[0]
-; CHECK-NEXT:    vmov.u16 r4, q0[6]
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q0[1]
-; CHECK-NEXT:    vmov.u16 r4, q0[7]
-; CHECK-NEXT:    vmov.u16 r12, q0[5]
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
-; CHECK-NEXT:    mov r3, sp
-; CHECK-NEXT:    vmov.u16 lr, q0[2]
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vldrb.u16 q0, [r3]
-; CHECK-NEXT:    vmov.u16 r3, q0[2]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r2
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, lr
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r12
-; CHECK-NEXT:    vadd.i32 q0, q1, q2
+; CHECK-NEXT:    vldrb.u16 q2, [r0]
+; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    str r3, [sp]
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vldrb.u16 q2, [r2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vadd.i32 q1, q1, q3
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
 ; CHECK-NEXT:    vstrb.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <12 x i8>, <12 x i8>* %src, align 4
   %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -721,38 +743,41 @@ entry:
 define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-LABEL: vld3_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
-; CHECK-NEXT:    vmov.8 q4[8], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    vmov.8 q4[9], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.8 q4[10], r2
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vmov.u8 r2, q2[0]
-; CHECK-NEXT:    vmov.8 q3[0], r2
+; CHECK-NEXT:    vmov.8 q1[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q2[3]
-; CHECK-NEXT:    vmov.8 q3[1], r2
+; CHECK-NEXT:    vmov.8 q1[1], r2
 ; CHECK-NEXT:    vmov.u8 r2, q2[6]
-; CHECK-NEXT:    vmov.8 q3[2], r2
+; CHECK-NEXT:    vmov.8 q1[2], r2
 ; CHECK-NEXT:    vmov.u8 r2, q2[9]
-; CHECK-NEXT:    vmov.8 q3[3], r2
+; CHECK-NEXT:    vmov.8 q1[3], r2
 ; CHECK-NEXT:    vmov.u8 r2, q2[12]
-; CHECK-NEXT:    vmov.8 q3[4], r2
+; CHECK-NEXT:    vmov.8 q1[4], r2
 ; CHECK-NEXT:    vmov.u8 r2, q2[15]
-; CHECK-NEXT:    vmov.8 q3[5], r2
+; CHECK-NEXT:    vmov.8 q1[5], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    vmov.8 q3[6], r2
+; CHECK-NEXT:    vmov.8 q1[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov.8 q1[7], r2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.32 q3[0], r2
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.8 q4[8], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.8 q4[9], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov.8 q4[10], r2
 ; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.8 q3[7], r2
 ; CHECK-NEXT:    vmov.8 q4[11], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r0
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.32 q3[2], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[1]
 ; CHECK-NEXT:    vmov.8 q5[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[4]
@@ -762,51 +787,51 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.u8 r0, q2[10]
 ; CHECK-NEXT:    vmov.8 q5[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[13]
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
 ; CHECK-NEXT:    vmov.8 q5[4], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.8 q6[8], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
 ; CHECK-NEXT:    vmov.8 q5[5], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.8 q6[9], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
 ; CHECK-NEXT:    vmov.8 q5[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.8 q6[10], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[2]
 ; CHECK-NEXT:    vmov.8 q5[7], r0
-; CHECK-NEXT:    vmov.8 q6[11], r2
 ; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r2, s26
-; CHECK-NEXT:    vmov q6[2], q6[0], r2, r0
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov r0, s21
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.8 q5[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.8 q5[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.8 q5[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q5[11], r0
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov.32 q4[2], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.8 q7[12], r0
+; CHECK-NEXT:    vmov.8 q5[12], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.8 q7[13], r0
+; CHECK-NEXT:    vmov.8 q5[13], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.8 q7[14], r0
+; CHECK-NEXT:    vmov.8 q5[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.8 q7[15], r0
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r2, s31
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r0
-; CHECK-NEXT:    vmov.u8 r2, q1[4]
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.8 q3[12], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[7]
-; CHECK-NEXT:    vmov.8 q3[13], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[10]
-; CHECK-NEXT:    vmov.8 q3[14], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[13]
-; CHECK-NEXT:    vmov.8 q3[15], r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r0
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.8 q5[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.8 q5[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.8 q5[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.8 q5[15], r0
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vadd.i8 q3, q4, q6
+; CHECK-NEXT:    vadd.i8 q3, q3, q4
 ; CHECK-NEXT:    vmov.8 q4[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[5]
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
 ; CHECK-NEXT:    vmov.8 q4[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[8]
 ; CHECK-NEXT:    vmov.8 q4[2], r0
@@ -815,35 +840,38 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.u8 r0, q2[14]
 ; CHECK-NEXT:    vmov.8 q4[4], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.8 q2[8], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
 ; CHECK-NEXT:    vmov.8 q4[5], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.8 q2[9], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[0]
 ; CHECK-NEXT:    vmov.8 q4[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.8 q2[10], r2
-; CHECK-NEXT:    vmov.u8 r2, q1[3]
-; CHECK-NEXT:    vmov.8 q2[11], r2
 ; CHECK-NEXT:    vmov.8 q4[7], r0
 ; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.8 q4[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.8 q4[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.8 q4[11], r0
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.32 q2[2], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q2[12], r0
+; CHECK-NEXT:    vmov.8 q0[12], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.8 q2[13], r0
+; CHECK-NEXT:    vmov.8 q0[13], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.8 q2[14], r0
+; CHECK-NEXT:    vmov.8 q0[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.8 q2[15], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
-; CHECK-NEXT:    vadd.i8 q0, q3, q0
+; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vadd.i8 q0, q3, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <48 x i8>, <48 x i8>* %src, align 4
@@ -874,32 +902,34 @@ define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s6, s10
 ; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    vmov.f32 s15, s17
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov.f64 d0, d4
 ; CHECK-NEXT:    vmov.f32 s1, s9
 ; CHECK-NEXT:    vmov.f32 s2, s18
 ; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov r12, s13
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vmov r12, s15
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    adc.w r2, r2, r12
 ; CHECK-NEXT:    adds.w lr, lr, r0
-; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    adc.w r12, r2, r3
-; CHECK-NEXT:    vmov r3, s15
-; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r3, s1
 ; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r4, pc}
@@ -917,89 +947,93 @@ entry:
 define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vld3_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #24
 ; CHECK-NEXT:    sub sp, #24
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q5, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d0, d4
-; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s1, s9
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s2, s6
-; CHECK-NEXT:    vmov.f64 d8, d5
-; CHECK-NEXT:    vmov.f32 s17, s11
-; CHECK-NEXT:    vmov.f32 s18, s4
-; CHECK-NEXT:    vmov.f32 s19, s5
-; CHECK-NEXT:    vmov.f64 d12, d11
-; CHECK-NEXT:    vmov.f32 s3, s7
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s25, s23
-; CHECK-NEXT:    vmov.f32 s26, s4
-; CHECK-NEXT:    vmov.f32 s22, s14
-; CHECK-NEXT:    vmov.f32 s27, s5
-; CHECK-NEXT:    vmov.f32 s23, s15
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov r12, s25
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov.f64 d14, d6
-; CHECK-NEXT:    vmov.f32 s29, s13
-; CHECK-NEXT:    vmov.f32 s30, s6
-; CHECK-NEXT:    vmov.f32 s31, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vmov.f32 s11, s7
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r6, s10
-; CHECK-NEXT:    vmov r7, s26
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f64 d4, d0
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s9, s1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vmov.f64 d14, d11
+; CHECK-NEXT:    vmov.f32 s29, s23
+; CHECK-NEXT:    vmov.f32 s30, s0
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vmov.f32 s23, s27
+; CHECK-NEXT:    vmov.f32 s31, s1
+; CHECK-NEXT:    vmov r3, s30
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s10, s18
+; CHECK-NEXT:    vmov.f32 s14, s16
+; CHECK-NEXT:    vmov.f32 s11, s19
+; CHECK-NEXT:    vmov.f32 s15, s17
+; CHECK-NEXT:    vmov.f64 d8, d12
+; CHECK-NEXT:    vmov.f32 s17, s25
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vmov r12, s31
+; CHECK-NEXT:    vmov r2, s23
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    adc.w r3, r2, r12
-; CHECK-NEXT:    vmov r2, s29
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    adc.w r2, r2, r12
 ; CHECK-NEXT:    adds.w lr, lr, r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    adc.w r12, r3, r2
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    adc.w r12, r2, r3
+; CHECK-NEXT:    vmov r3, s29
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    adds r0, r0, r4
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    adcs r2, r3
 ; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov r2, s9
 ; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    adds r3, r3, r4
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    adcs r0, r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    adds.w lr, r3, r4
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r4, s5
+; CHECK-NEXT:    adc.w r12, r0, r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    adc.w r8, r2, r3
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s2
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s23
-; CHECK-NEXT:    vmov q1[2], q1[0], r5, r0
-; CHECK-NEXT:    vmov r0, s31
-; CHECK-NEXT:    adcs r3, r4
-; CHECK-NEXT:    vmov r4, s27
-; CHECK-NEXT:    adds r2, r2, r7
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r8
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    adcs r4, r6
-; CHECK-NEXT:    vmov r6, s30
-; CHECK-NEXT:    adds r2, r2, r6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    adcs r0, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
+; CHECK-NEXT:    adcs r2, r3
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %l1 = load <12 x i64>, <12 x i64>* %src, align 4
   %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1319,93 +1353,97 @@ entry:
 define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld3_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s0, s10
-; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    vmovx.f16 s0, s6
+; CHECK-NEXT:    vmov r3, s5
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov.16 q0[0], r3
 ; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s12, s9
 ; CHECK-NEXT:    vmov.16 q0[2], r2
 ; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s19
 ; CHECK-NEXT:    vmov.16 q0[3], r2
 ; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmov.f32 s2, s11
 ; CHECK-NEXT:    vmovx.f16 s20, s16
 ; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmov.16 q3[6], r3
 ; CHECK-NEXT:    vmov.16 q3[7], r0
 ; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmovx.f16 s24, s11
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r5, s8
 ; CHECK-NEXT:    vmov.f32 s14, s16
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    vmov.16 q5[5], r0
 ; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov r12, s22
+; CHECK-NEXT:    vmov lr, s22
 ; CHECK-NEXT:    vmovx.f16 s20, s17
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov.16 q5[6], r3
 ; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov lr, s23
-; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r12, s23
+; CHECK-NEXT:    vmovx.f16 s20, s10
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov.16 q5[4], r3
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov r3, s22
+; CHECK-NEXT:    vmovx.f16 s20, s18
 ; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q4[6], r2
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov r0, s24
 ; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov r0, s24
 ; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmovx.f16 s20, s6
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r2, s22
-; CHECK-NEXT:    vmovx.f16 s20, s9
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov.16 q5[0], r5
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s18
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmovx.f16 s24, s11
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s4
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.16 q6[0], r2
 ; CHECK-NEXT:    vmovx.f16 s4, s7
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov.16 q2[6], r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov.16 q2[7], r5
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q1[2], q1[0], r12, r4
+; CHECK-NEXT:    vmov.16 q6[1], r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.32 q1[0], r4
+; CHECK-NEXT:    vmov.16 q6[2], r0
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov.32 q1[1], r4
+; CHECK-NEXT:    vmov r0, s25
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q1[2], lr
+; CHECK-NEXT:    vmov.32 q0[2], r3
 ; CHECK-NEXT:    vmov r4, s15
-; CHECK-NEXT:    vmov.f32 s23, s11
-; CHECK-NEXT:    vmov q0[3], q0[1], lr, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r5
+; CHECK-NEXT:    vmov.f32 s23, s19
+; CHECK-NEXT:    vmov.32 q0[3], r12
+; CHECK-NEXT:    vmov.32 q1[3], r4
 ; CHECK-NEXT:    vadd.f16 q0, q5, q0
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %l1 = load <24 x half>, <24 x half>* %src, align 4
   %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -1420,174 +1458,182 @@ entry:
 define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld3_v16f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT:    vmovx.f16 s0, s7
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmovx.f16 s20, s17
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT:    vmovx.f16 s0, s19
+; CHECK-NEXT:    vmovx.f16 s4, s16
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmovx.f16 s20, s13
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov r12, s8
-; CHECK-NEXT:    vmovx.f16 s8, s14
-; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vmov r12, s4
+; CHECK-NEXT:    vmovx.f16 s4, s10
+; CHECK-NEXT:    vmov r2, s9
 ; CHECK-NEXT:    vmov.16 q0[7], r3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.16 q2[1], r3
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.16 q1[1], r3
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q1[2], r2
 ; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov.f32 s10, s19
-; CHECK-NEXT:    vmovx.f16 s24, s13
-; CHECK-NEXT:    vmovx.f16 s28, s16
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov.f32 s6, s15
+; CHECK-NEXT:    vmovx.f16 s24, s12
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmovx.f16 s16, s18
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov.16 q5[4], r2
 ; CHECK-NEXT:    vmov.16 q5[5], r12
-; CHECK-NEXT:    vmov r12, s22
-; CHECK-NEXT:    vmovx.f16 s20, s6
+; CHECK-NEXT:    vmov lr, s22
+; CHECK-NEXT:    vmovx.f16 s20, s14
 ; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov r2, s19
+; CHECK-NEXT:    vmov.16 q5[5], r3
+; CHECK-NEXT:    vmov r12, s22
+; CHECK-NEXT:    vmovx.f16 s20, s17
+; CHECK-NEXT:    vmov r3, s20
 ; CHECK-NEXT:    vmov.16 q5[6], r3
-; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov.16 q4[6], r3
+; CHECK-NEXT:    vmov r2, s23
+; CHECK-NEXT:    vmov.16 q4[7], r4
+; CHECK-NEXT:    vmovx.f16 s20, s9
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmov.16 q5[0], r4
+; CHECK-NEXT:    vmov.16 q5[1], r3
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    vmov.16 q5[2], r3
+; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vmov.16 q5[3], r3
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmovx.f16 s24, s15
+; CHECK-NEXT:    vmov.16 q5[4], r3
+; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmov.16 q5[5], r3
+; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    vmov.16 q6[0], r3
-; CHECK-NEXT:    vmov.16 q6[1], r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov.16 q6[3], r2
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmovx.f16 s28, s19
-; CHECK-NEXT:    vmov.16 q6[4], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmovx.f16 s28, s18
-; CHECK-NEXT:    vmov.16 q6[5], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q7[4], r2
-; CHECK-NEXT:    vmov.16 q7[5], r3
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov lr, s30
-; CHECK-NEXT:    vmovx.f16 s28, s12
-; CHECK-NEXT:    vmov r3, s28
-; CHECK-NEXT:    vmovx.f16 s12, s15
-; CHECK-NEXT:    vmov.16 q7[0], r3
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.16 q7[1], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov.16 q7[2], r2
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    vmov.16 q1[6], r5
-; CHECK-NEXT:    vmov.16 q7[3], r2
-; CHECK-NEXT:    vmov.16 q1[7], r4
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], lr, r2
-; CHECK-NEXT:    vmov r6, s29
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
-; CHECK-NEXT:    vmov q1[2], q1[0], r12, r5
-; CHECK-NEXT:    vmov r5, s9
-; CHECK-NEXT:    vmov.f32 s27, s23
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r5
-; CHECK-NEXT:    vadd.f16 q0, q6, q0
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vmov.16 q6[1], r4
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov.32 q2[0], r5
+; CHECK-NEXT:    vmov.16 q6[2], r3
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    vmov.16 q6[3], r3
+; CHECK-NEXT:    vmov r5, s5
+; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vmov.32 q2[1], r5
+; CHECK-NEXT:    vmov r5, s3
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov r4, s25
+; CHECK-NEXT:    vmov.32 q2[2], lr
+; CHECK-NEXT:    vmov.32 q0[1], r4
+; CHECK-NEXT:    vmov.f32 s23, s19
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmov.32 q2[3], r5
+; CHECK-NEXT:    vadd.f16 q0, q5, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s20, s9
-; CHECK-NEXT:    vmovx.f16 s0, s7
-; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmovx.f16 s0, s19
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmovx.f16 s12, s16
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmovx.f16 s16, s4
 ; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmovx.f16 s16, s14
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmovx.f16 s24, s13
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.16 q4[1], r3
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q4[2], r2
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s10
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.16 q3[1], r3
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov.16 q3[2], r2
 ; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmovx.f16 s28, s8
-; CHECK-NEXT:    vmov.f32 s18, s11
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov r5, s7
-; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmovx.f16 s24, s4
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmovx.f16 s16, s18
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    vmov r2, s22
 ; CHECK-NEXT:    vmovx.f16 s20, s6
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q5[5], r3
+; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov r12, s22
+; CHECK-NEXT:    vmovx.f16 s20, s17
+; CHECK-NEXT:    vmov r5, s20
+; CHECK-NEXT:    vmov.16 q5[6], r5
+; CHECK-NEXT:    vmov r5, s17
 ; CHECK-NEXT:    vmov.16 q5[7], r3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.16 q6[0], r3
-; CHECK-NEXT:    vmov.16 q6[1], r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov.16 q6[3], r2
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmovx.f16 s28, s11
-; CHECK-NEXT:    vmov.16 q6[4], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmovx.f16 s28, s10
-; CHECK-NEXT:    vmov.16 q6[5], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q7[4], r2
-; CHECK-NEXT:    vmov.16 q7[5], r3
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    vmov r2, s30
-; CHECK-NEXT:    vmovx.f16 s28, s12
-; CHECK-NEXT:    vmov r6, s28
-; CHECK-NEXT:    vmovx.f16 s12, s15
-; CHECK-NEXT:    vmov.16 q7[0], r6
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.16 q7[1], r3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov.16 q7[2], r3
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov.16 q1[6], r4
-; CHECK-NEXT:    vmov.16 q7[3], r3
-; CHECK-NEXT:    vmov.16 q1[7], r5
-; CHECK-NEXT:    vmov r3, s28
-; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov r5, s7
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r4
-; CHECK-NEXT:    vmov r4, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT:    vmov r6, s29
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
-; CHECK-NEXT:    vmov.f32 s27, s23
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r0
-; CHECK-NEXT:    vadd.f16 q0, q6, q0
+; CHECK-NEXT:    vmov.16 q4[6], r5
+; CHECK-NEXT:    vmov r3, s23
+; CHECK-NEXT:    vmov.16 q4[7], r4
+; CHECK-NEXT:    vmovx.f16 s20, s9
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r5, s20
+; CHECK-NEXT:    vmov.16 q5[0], r4
+; CHECK-NEXT:    vmov.16 q5[1], r5
+; CHECK-NEXT:    vmov r5, s11
+; CHECK-NEXT:    vmov.16 q5[2], r5
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    vmov.16 q5[3], r5
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmovx.f16 s24, s7
+; CHECK-NEXT:    vmov.16 q5[4], r5
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmov.16 q5[5], r5
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    vmov r4, s10
+; CHECK-NEXT:    vmov.16 q6[0], r5
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vmov.16 q6[1], r4
+; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov.16 q6[2], r5
+; CHECK-NEXT:    vmov r5, s5
+; CHECK-NEXT:    vmov.16 q6[3], r5
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.32 q0[0], r5
+; CHECK-NEXT:    vmov r4, s25
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov.32 q0[1], r4
+; CHECK-NEXT:    vmov.f32 s23, s19
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    vadd.f16 q0, q5, q0
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %l1 = load <48 x half>, <48 x half>* %src, align 4
   %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
index 548a188e3239..e8f336871326 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
@@ -119,41 +119,43 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s14, s20
 ; CHECK-NEXT:    vmov.f32 s19, s23
 ; CHECK-NEXT:    vmov.f32 s15, s21
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s6, s10
 ; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s3, s9
 ; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov r12, s17
-; CHECK-NEXT:    vmov lr, s13
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r7, s2
+; CHECK-NEXT:    vmov r12, s19
+; CHECK-NEXT:    vmov lr, s15
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vmov r5, s2
+; CHECK-NEXT:    vmov r7, s0
 ; CHECK-NEXT:    adds r6, r3, r2
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    adc.w r12, r12, lr
 ; CHECK-NEXT:    adds r5, r5, r4
-; CHECK-NEXT:    vmov r4, s18
+; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    adcs r2, r3
 ; CHECK-NEXT:    adds.w lr, r5, r6
 ; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r6, s19
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r6, s17
+; CHECK-NEXT:    vmov r5, s13
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov r4, s1
 ; CHECK-NEXT:    adcs r6, r5
-; CHECK-NEXT:    vmov r5, s7
+; CHECK-NEXT:    vmov r5, s5
 ; CHECK-NEXT:    adds r3, r3, r7
 ; CHECK-NEXT:    adcs r4, r5
 ; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    adc.w r3, r4, r6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index cb49257ce0bd..776598d06db7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -229,33 +229,41 @@ define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q1[3]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r0
+; CHECK-NEXT:    vmov.32 q3[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r2, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.32 q4[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov q4[2], q4[0], r2, r0
+; CHECK-NEXT:    vmov.32 q4[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r0
+; CHECK-NEXT:    vmov.32 q4[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vadd.i32 q0, q3, q4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
@@ -382,12 +390,14 @@ define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
 ; CHECK-NEXT:    vrev32.8 q2, q0
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, q2
 ; CHECK-NEXT:    vrev16.8 q2, q0
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
@@ -545,42 +555,44 @@ define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s14, s20
 ; CHECK-NEXT:    vmov.f32 s19, s23
 ; CHECK-NEXT:    vmov.f32 s15, s21
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r12, s17
-; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov r0, s14
 ; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov r12, s19
+; CHECK-NEXT:    vmov r2, s15
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s6, s10
 ; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s3, s9
 ; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov r6, s2
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov r6, s0
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    vmov r4, s15
+; CHECK-NEXT:    vmov r4, s13
 ; CHECK-NEXT:    adcs r0, r3
 ; CHECK-NEXT:    adds.w lr, lr, r2
 ; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s19
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r3, s17
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r2, r4, r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov r3, s5
+; CHECK-NEXT:    vmov r4, s1
 ; CHECK-NEXT:    adds r5, r5, r6
 ; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    adds r0, r0, r5
 ; CHECK-NEXT:    adcs r2, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
@@ -600,118 +612,123 @@ entry:
 define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vld4_v4i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    .pad #72
+; CHECK-NEXT:    sub sp, #72
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d3
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d8, d3
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s19, s3
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT:    vmov.f64 d14, d9
-; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d12, d11
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s25, s23
+; CHECK-NEXT:    vmov.f32 s26, s2
+; CHECK-NEXT:    vmov.f64 d6, d3
+; CHECK-NEXT:    vmov.f32 s27, s3
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vmov.f32 s14, s2
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s29, s19
-; CHECK-NEXT:    vmov.f32 s30, s2
-; CHECK-NEXT:    vmov.f64 d4, d13
-; CHECK-NEXT:    vmov.f32 s31, s3
+; CHECK-NEXT:    vmov.f64 d4, d15
+; CHECK-NEXT:    vmov.f32 s15, s3
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s9, s27
+; CHECK-NEXT:    vmov.f32 s9, s31
 ; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s26, s0
+; CHECK-NEXT:    vmov.f32 s30, s0
 ; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov.f32 s27, s1
+; CHECK-NEXT:    vmov.f32 s31, s1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmov r12, s9
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    vmov.f64 d10, d7
-; CHECK-NEXT:    vmov.f32 s21, s15
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s14, s4
-; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s23, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov.f32 s18, s0
-; CHECK-NEXT:    vmov.f32 s19, s1
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r0, s30
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov r4, s6
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov r12, s11
+; CHECK-NEXT:    vmov r2, s31
+; CHECK-NEXT:    vmov.f32 s22, s0
+; CHECK-NEXT:    vmov.f32 s23, s1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r7, s16
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    adc.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vmov r6, s2
 ; CHECK-NEXT:    adds r3, r3, r4
-; CHECK-NEXT:    vmov r4, s30
+; CHECK-NEXT:    vmov r4, s23
 ; CHECK-NEXT:    adcs r0, r2
 ; CHECK-NEXT:    adds.w lr, lr, r3
-; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    adc.w r12, r12, r0
-; CHECK-NEXT:    vmov r0, s31
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vmov r2, s22
+; CHECK-NEXT:    vmov r3, s27
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adc.w r2, r4, r3
 ; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    adds r2, r2, r4
-; CHECK-NEXT:    adcs r3, r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov r6, s2
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov r7, s4
 ; CHECK-NEXT:    vmov r4, s3
 ; CHECK-NEXT:    adds r5, r5, r6
-; CHECK-NEXT:    vmov r6, s16
-; CHECK-NEXT:    adcs r4, r0
-; CHECK-NEXT:    adds.w r9, r5, r2
-; CHECK-NEXT:    vmov r5, s28
-; CHECK-NEXT:    adc.w r8, r4, r3
-; CHECK-NEXT:    vmov r2, s29
-; CHECK-NEXT:    vmov r4, s17
+; CHECK-NEXT:    vmov r6, s20
+; CHECK-NEXT:    adcs r3, r4
+; CHECK-NEXT:    adds r0, r0, r5
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    adc.w r8, r3, r2
+; CHECK-NEXT:    vmov r2, s25
+; CHECK-NEXT:    vmov r4, s21
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    adds r5, r5, r6
 ; CHECK-NEXT:    vmov r6, s1
 ; CHECK-NEXT:    adcs r2, r4
-; CHECK-NEXT:    vmov r4, s5
+; CHECK-NEXT:    vmov r4, s17
 ; CHECK-NEXT:    adds r3, r3, r7
-; CHECK-NEXT:    vmov r7, s14
+; CHECK-NEXT:    vmov r7, s28
 ; CHECK-NEXT:    adcs r4, r6
 ; CHECK-NEXT:    adds r3, r3, r5
-; CHECK-NEXT:    vmov r6, s22
-; CHECK-NEXT:    adc.w r10, r4, r2
-; CHECK-NEXT:    vmov r4, s23
-; CHECK-NEXT:    vmov q1[2], q1[0], r9, r3
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    vmov q1[3], q1[1], r8, r10
-; CHECK-NEXT:    vmov r2, s26
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov r6, s8
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov r4, s9
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov r5, s29
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov r3, s13
+; CHECK-NEXT:    vmov.32 q0[3], r8
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    adds r6, r6, r7
-; CHECK-NEXT:    vmov r7, s27
 ; CHECK-NEXT:    adcs r4, r5
-; CHECK-NEXT:    vmov r5, s11
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r7, r5
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    adds r0, r0, r5
+; CHECK-NEXT:    adcs r2, r3
 ; CHECK-NEXT:    adds r0, r0, r6
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, lr
-; CHECK-NEXT:    adc.w r0, r4, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
+; CHECK-NEXT:    adcs r2, r4
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], lr
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #72
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <16 x i64>, <16 x i64>* %src, align 4
   %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 0ea0bd3d2711..279665604967 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -6,11 +6,13 @@ define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmullb.s32 q2, q0, q1
 ; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    asrs r0, r0, #31
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <2 x i32> %s0 to <2 x i64>
@@ -46,28 +48,28 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
 ; CHECK-LABEL: vmulhs_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.f32 s8, s4
 ; CHECK-NEXT:    vmov.f32 s12, s0
-; CHECK-NEXT:    vmov.f32 s10, s5
 ; CHECK-NEXT:    vmov.f32 s14, s1
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov.f32 s16, s6
-; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.f32 s12, s6
+; CHECK-NEXT:    vmov.f32 s14, s7
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmullb.s32 q5, q1, q4
+; CHECK-NEXT:    vmullb.s32 q0, q1, q3
 ; CHECK-NEXT:    smmul r0, r1, r0
-; CHECK-NEXT:    vmov r1, s21
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    smmul r0, r1, r0
-; CHECK-NEXT:    vmov r1, s23
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    smmul r1, r2, r1
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -140,18 +142,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vmulhs_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmullb.s16 q2, q3, q2
 ; CHECK-NEXT:    vshr.s32 q3, q2, #16
 ; CHECK-NEXT:    vmov r0, s12
@@ -163,16 +168,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmullb.s16 q0, q1, q3
 ; CHECK-NEXT:    vshr.s32 q0, q0, #16
 ; CHECK-NEXT:    vmov r0, s0
@@ -198,18 +208,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vmulhu_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmullb.u16 q2, q3, q2
 ; CHECK-NEXT:    vshr.u32 q3, q2, #16
 ; CHECK-NEXT:    vmov r0, s12
@@ -221,16 +234,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmullb.u16 q0, q1, q3
 ; CHECK-NEXT:    vshr.u32 q0, q0, #16
 ; CHECK-NEXT:    vmov r0, s0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
index d17e4f05a44c..ecb119888c86 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
@@ -4,33 +4,38 @@
 define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32* nocapture %z, i32 %n) {
 ; CHECK-LABEL: test32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r5, lr}
+; CHECK-NEXT:    push {r5, lr}
 ; CHECK-NEXT:    cmp r3, #1
-; CHECK-NEXT:    blt .LBB0_2
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    poplt {r5, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vmullt.s32 q0, q2, q1
-; CHECK-NEXT:    vmullb.s32 q3, q2, q1
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    vmov r7, s3
+; CHECK-NEXT:    vmullt.s32 q3, q2, q1
+; CHECK-NEXT:    vmov r5, s13
+; CHECK-NEXT:    vmov r12, s12
 ; CHECK-NEXT:    lsrl r12, r5, #31
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    lsrl r4, r7, #31
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r12
+; CHECK-NEXT:    vmov.32 q0[0], r12
+; CHECK-NEXT:    vmov r12, s14
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    lsrl r12, r5, #31
+; CHECK-NEXT:    vmullb.s32 q3, q2, q1
+; CHECK-NEXT:    vmov.32 q0[2], r12
 ; CHECK-NEXT:    vmov r12, s12
-; CHECK-NEXT:    vmov q0[3], q0[1], r7, r5
+; CHECK-NEXT:    vmov.32 q0[3], r5
 ; CHECK-NEXT:    vmov r5, s13
-; CHECK-NEXT:    vmov r7, s15
 ; CHECK-NEXT:    lsrl r12, r5, #31
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    lsrl r4, r7, #31
-; CHECK-NEXT:    vmov q1[2], q1[0], r4, r12
-; CHECK-NEXT:    vmov q1[3], q1[1], r7, r5
+; CHECK-NEXT:    vmov.32 q1[0], r12
+; CHECK-NEXT:    vmov r12, s14
+; CHECK-NEXT:    vmov.32 q1[1], r5
+; CHECK-NEXT:    vmov r5, s15
+; CHECK-NEXT:    lsrl r12, r5, #31
+; CHECK-NEXT:    vmov.32 q1[2], r12
+; CHECK-NEXT:    vmov.32 q1[3], r5
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.f32 s9, s7
 ; CHECK-NEXT:    vmov.f32 s6, s0
@@ -42,8 +47,8 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noa
 ; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB0_1
-; CHECK-NEXT:  .LBB0_2: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r5, pc}
 entry:
   %0 = and i32 %n, 3
   %cmp = icmp eq i32 %0, 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
index 0a96bd653c3c..995ac7d88fde 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
@@ -74,18 +74,21 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.u16 r1, q2[3]
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q2[6]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmullb.s16 q0, q3, q0
 ; CHECK-NEXT:    vmov.i32 q3, #0x7fff
 ; CHECK-NEXT:    vshl.i32 q0, q0, #10
@@ -101,16 +104,21 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    vmov.16 q0[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
+; CHECK-NEXT:    vmov.32 q4[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.u16 r1, q2[7]
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.32 q4[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.32 q2[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmullb.s16 q1, q2, q4
 ; CHECK-NEXT:    vshl.i32 q1, q1, #10
 ; CHECK-NEXT:    vshr.s32 q1, q1, #10

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
index 22faf84b6fed..b8304cf82bea 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
@@ -165,28 +165,30 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_smaxmin:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mvn r12, #-2147483648
+; CHECK-NEXT:    mvn r3, #-2147483648
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    subs.w r2, r2, r12
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    subs r2, r2, r3
 ; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs.w r3, r3, r12
-; CHECK-NEXT:    mov.w r12, #-1
-; CHECK-NEXT:    sbcs r2, r2, #0
-; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    subs r2, r2, r3
+; CHECK-NEXT:    mov.w r3, #-1
+; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #1
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    movlt r1, #1
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.32 q1[3], r1
 ; CHECK-NEXT:    adr r1, .LCPI12_0
 ; CHECK-NEXT:    vldrw.u32 q2, [r1]
 ; CHECK-NEXT:    vand q0, q0, q1
@@ -194,23 +196,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) {
 ; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    sbcs.w r1, r12, r1
+; CHECK-NEXT:    sbcs.w r1, r3, r1
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
-; CHECK-NEXT:    sbcs.w r2, r12, r2
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
+; CHECK-NEXT:    sbcs.w r1, r3, r1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    adr r0, .LCPI12_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
@@ -241,28 +245,30 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_sminmax:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mov.w r12, #-1
+; CHECK-NEXT:    mov.w r3, #-1
 ; CHECK-NEXT:    vmov r1, s1
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    sbcs.w r1, r12, r1
+; CHECK-NEXT:    sbcs.w r1, r3, r1
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
-; CHECK-NEXT:    sbcs.w r2, r12, r2
-; CHECK-NEXT:    mvn r12, #-2147483648
-; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    rsbs.w r2, r2, #-2147483648
+; CHECK-NEXT:    sbcs.w r1, r3, r1
+; CHECK-NEXT:    mvn r3, #-2147483648
+; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #1
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r2, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    movlt r1, #1
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.32 q1[3], r1
 ; CHECK-NEXT:    adr r1, .LCPI13_0
 ; CHECK-NEXT:    vldrw.u32 q2, [r1]
 ; CHECK-NEXT:    vand q0, q0, q1
@@ -270,23 +276,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) {
 ; CHECK-NEXT:    vorr q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    subs.w r2, r2, r12
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    subs r2, r2, r3
 ; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r1, #1
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    subs.w r3, r3, r12
-; CHECK-NEXT:    sbcs r2, r2, #0
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    subs r2, r2, r3
+; CHECK-NEXT:    sbcs r1, r1, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    adr r0, .LCPI13_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
@@ -320,23 +328,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_umaxmin(<2 x i64> %s0) {
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    subs.w r1, r1, #-1
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    subs.w r3, r3, #-1
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    subs.w r1, r1, #-1
+; CHECK-NEXT:    sbcs r0, r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
@@ -354,23 +364,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_uminmax(<2 x i64> %s0) {
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    subs.w r1, r1, #-1
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    subs.w r3, r3, #-1
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    subs.w r1, r1, #-1
+; CHECK-NEXT:    sbcs r0, r0, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vorr q0, q0, q2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
index 39d9cf26f76e..c2210925bfe1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
@@ -180,65 +180,71 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_smaxmin(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_smaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    mvn r12, #-2147483648
+; CHECK-NEXT:    .save {r5, lr}
+; CHECK-NEXT:    push {r5, lr}
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    mvn lr, #-2147483648
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mov.w lr, #0
-; CHECK-NEXT:    asrl r2, r1, #3
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    subs.w r3, r2, r12
-; CHECK-NEXT:    sbcs r3, r1, #0
-; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    asrl r2, r5, #3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    asrl r4, r3, #3
-; CHECK-NEXT:    subs.w r0, r4, r12
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT:    sbcs r0, r3, #0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
+; CHECK-NEXT:    subs.w r0, r2, lr
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    sbcs r0, r5, #0
+; CHECK-NEXT:    vmov.32 q2[1], r5
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    mov.w r2, #-1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r5
-; CHECK-NEXT:    adr r0, .LCPI12_0
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vbic q1, q1, q0
-; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    asrl r0, r3, #3
+; CHECK-NEXT:    subs.w r1, r0, lr
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    sbcs r1, r3, #0
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    mov.w r1, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r1, #1
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    csetm r1, ne
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    adr r1, .LCPI12_0
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vbic q0, q0, q1
+; CHECK-NEXT:    vand q1, q2, q1
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    sbcs.w r0, r2, r0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    rsbs.w r3, r3, #-2147483648
-; CHECK-NEXT:    sbcs.w r1, r2, r1
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    rsbs.w r1, r1, #-2147483648
+; CHECK-NEXT:    sbcs.w r0, r2, r0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w lr, #1
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    movlt.w r12, #1
+; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    adr r0, .LCPI12_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r5, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI12_0:
@@ -263,65 +269,71 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_sminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    mov.w r12, #-1
+; CHECK-NEXT:    mov.w lr, #-1
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    mov.w lr, #0
+; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    asrl r2, r1, #3
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    rsbs.w r3, r2, #-2147483648
-; CHECK-NEXT:    sbcs.w r3, r12, r1
-; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #1
-; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    asrl r4, r3, #3
-; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT:    sbcs.w r5, r12, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r1
-; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    rsbs.w r0, r2, #-2147483648
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    sbcs.w r0, lr, r1
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    mvn r2, #-2147483648
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #1
-; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    csetm r5, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r0
-; CHECK-NEXT:    adr r0, .LCPI13_0
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vbic q1, q1, q0
-; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    movlt r0, #1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    asrl r0, r3, #3
+; CHECK-NEXT:    rsbs.w r4, r0, #-2147483648
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    sbcs.w r4, lr, r3
+; CHECK-NEXT:    vmov.32 q2[3], r3
+; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csetm r4, ne
+; CHECK-NEXT:    vmov.32 q1[2], r4
+; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    adr r4, .LCPI13_0
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
+; CHECK-NEXT:    vbic q0, q0, q1
+; CHECK-NEXT:    vand q1, q2, q1
+; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    subs r1, r1, r2
-; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    sbcs r0, r0, #0
+; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    mov.w r0, #0
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    subs r2, r3, r2
-; CHECK-NEXT:    sbcs r1, r1, #0
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    subs r1, r1, r2
+; CHECK-NEXT:    sbcs r0, r0, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w lr, #1
-; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    movlt.w r12, #1
+; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    csetm r0, ne
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    adr r0, .LCPI13_1
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vbic q2, q2, q1
 ; CHECK-NEXT:    vorr q0, q0, q2
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI13_0:
@@ -346,37 +358,41 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_umaxmin(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_umaxmin:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    .save {r5, r6, r7, lr}
+; CHECK-NEXT:    push {r5, r6, r7, lr}
+; CHECK-NEXT:    vmov r7, s1
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
-; CHECK-NEXT:    lsrl r0, r5, #3
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    subs.w r3, r0, #-1
-; CHECK-NEXT:    sbcs r3, r5, #0
-; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r3, #1
-; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    lsrl r0, r7, #3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    csetm r12, ne
-; CHECK-NEXT:    lsrl r4, r3, #3
-; CHECK-NEXT:    subs.w r1, r4, #-1
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r0
-; CHECK-NEXT:    sbcs r1, r3, #0
+; CHECK-NEXT:    subs.w r2, r0, #-1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    sbcs r2, r7, #0
+; CHECK-NEXT:    vmov.32 q2[1], r7
+; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r5
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    lsrl r2, r3, #3
+; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
+; CHECK-NEXT:    subs.w r5, r2, #-1
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    sbcs r5, r3, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r1, #1
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r12
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
-; CHECK-NEXT:    vbic q1, q1, q0
-; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vbic q0, q0, q1
+; CHECK-NEXT:    vand q1, q2, q1
+; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    pop {r5, r6, r7, pc}
 entry:
   %s0 = lshr <2 x i64> %so, <i64 3, i64 3>
   %c1 = icmp ult <2 x i64> %s0, <i64 4294967295, i64 4294967295>
@@ -387,37 +403,41 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @vqshrni64_uminmax(<2 x i64> %so) {
 ; CHECK-LABEL: vqshrni64_uminmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r5, s1
-; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    .save {r5, r6, r7, lr}
+; CHECK-NEXT:    push {r5, r6, r7, lr}
+; CHECK-NEXT:    vmov r7, s1
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
-; CHECK-NEXT:    lsrl r0, r5, #3
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    subs.w r3, r0, #-1
-; CHECK-NEXT:    sbcs r3, r5, #0
-; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r3, #1
-; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    lsrl r0, r7, #3
 ; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    csetm r12, ne
-; CHECK-NEXT:    lsrl r4, r3, #3
-; CHECK-NEXT:    subs.w r1, r4, #-1
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r0
-; CHECK-NEXT:    sbcs r1, r3, #0
+; CHECK-NEXT:    subs.w r2, r0, #-1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    sbcs r2, r7, #0
+; CHECK-NEXT:    vmov.32 q2[1], r7
+; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r5
+; CHECK-NEXT:    csetm r2, ne
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    lsrl r2, r3, #3
+; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
+; CHECK-NEXT:    subs.w r5, r2, #-1
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    sbcs r5, r3, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r1, #1
+; CHECK-NEXT:    cmp r1, #0
+; CHECK-NEXT:    vmov.32 q2[3], r3
 ; CHECK-NEXT:    csetm r1, ne
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r12
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r12
-; CHECK-NEXT:    vbic q1, q1, q0
-; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vbic q0, q0, q1
+; CHECK-NEXT:    vand q1, q2, q1
+; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    pop {r5, r6, r7, pc}
 entry:
   %s0 = lshr <2 x i64> %so, <i64 3, i64 3>
   %c2 = icmp ult <2 x i64> %s0, <i64 4294967295, i64 4294967295>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index 2ee7bbe8cf7d..b815ed24ae26 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -6,19 +6,21 @@
 define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vst2_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    ldrd r12, r3, [r0]
-; CHECK-NEXT:    ldrd r2, r0, [r0, #8]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r12
-; CHECK-NEXT:    vmov.f64 d2, d1
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
-; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    ldrd r2, r12, [r0]
+; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.f32 s9, s3
 ; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s10
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -111,12 +113,14 @@ entry:
 define void @vst2_v2i16(<2 x i16> *%src, <4 x i16> *%dst) {
 ; CHECK-LABEL: vst2_v2i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    ldrh r2, [r0]
-; CHECK-NEXT:    ldrh r3, [r0, #2]
-; CHECK-NEXT:    ldrh.w r12, [r0, #4]
-; CHECK-NEXT:    ldrh r0, [r0, #6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
+; CHECK-NEXT:    ldrh r3, [r0]
+; CHECK-NEXT:    ldrh r2, [r0, #4]
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    ldrh.w r12, [r0, #6]
+; CHECK-NEXT:    ldrh r0, [r0, #2]
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q0[3], r12
 ; CHECK-NEXT:    vstrh.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -193,11 +197,13 @@ define void @vst2_v2i8(<2 x i8> *%src, <4 x i8> *%dst) {
 ; CHECK-LABEL: vst2_v2i8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    ldrb r2, [r0]
-; CHECK-NEXT:    ldrb r3, [r0, #1]
-; CHECK-NEXT:    ldrb.w r12, [r0, #2]
+; CHECK-NEXT:    ldrb r3, [r0, #2]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    ldrb.w r12, [r0, #1]
+; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    ldrb r0, [r0, #3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vstrb.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index a3fe34927f84..1ae74c1738c7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -8,13 +8,14 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrd lr, r12, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, lr
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r3
 ; CHECK-NEXT:    vmov.32 q0[0], r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
+; CHECK-NEXT:    vmov.32 q1[2], r12
 ; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q1[3], lr
 ; CHECK-NEXT:    vmov.f32 s8, s7
 ; CHECK-NEXT:    vmov.f32 s10, s1
 ; CHECK-NEXT:    vmov r2, s8
@@ -301,16 +302,18 @@ define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) {
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    ldrh r2, [r0, #6]
-; CHECK-NEXT:    ldrh.w lr, [r0, #4]
+; CHECK-NEXT:    ldrh r3, [r0, #4]
 ; CHECK-NEXT:    ldrh.w r12, [r0, #8]
 ; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    ldrh r3, [r0, #2]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
+; CHECK-NEXT:    ldrh.w lr, [r0, #2]
+; CHECK-NEXT:    vmov.32 q1[0], r3
 ; CHECK-NEXT:    ldrh r4, [r0]
+; CHECK-NEXT:    vmov.32 q1[2], r2
 ; CHECK-NEXT:    ldrh r0, [r0, #10]
 ; CHECK-NEXT:    vmov.16 q0[5], r0
 ; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r4
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    vmov.32 q0[2], lr
 ; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vmov.f32 s3, s2
@@ -686,8 +689,9 @@ define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) {
 ; CHECK-NEXT:    ldrb r2, [r0]
 ; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:    ldrb r3, [r0, #1]
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    ldrb.w r12, [r0, #2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r3
 ; CHECK-NEXT:    ldrb.w lr, [r0, #3]
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    ldrb r5, [r0, #5]
@@ -1457,21 +1461,23 @@ entry:
 define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-LABEL: vst3_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrd lr, r12, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
-; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
-; CHECK-NEXT:    vmov.32 q1[0], r4
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], lr
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.16 q2[0], r3
 ; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s0, s3
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vmov.16 q2[2], r0
 ; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s2
@@ -1480,7 +1486,6 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    vmovx.f16 s12, s4
 ; CHECK-NEXT:    vmov.16 q2[4], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vmov.16 q2[5], r0
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vmov.16 q2[6], r0
@@ -1500,7 +1505,7 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    vmov r0, s9
 ; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    strd r2, r0, [r1, #16]
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
   %l1 = load <4 x half>, <4 x half>* %s1, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 0f089cb8769d..bc5415566424 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -8,16 +8,18 @@ define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrd lr, r12, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r4
+; CHECK-NEXT:    vmov.32 q1[0], r4
+; CHECK-NEXT:    vmov.32 q1[2], r0
 ; CHECK-NEXT:    vmov.f64 d0, d2
 ; CHECK-NEXT:    vmov.f32 s1, s6
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, lr
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r3
+; CHECK-NEXT:    vmov.32 q1[2], r12
+; CHECK-NEXT:    vmov.32 q1[3], lr
 ; CHECK-NEXT:    vmov.f64 d4, d2
 ; CHECK-NEXT:    vmov.f32 s9, s6
 ; CHECK-NEXT:    vmov.f32 s10, s0
@@ -207,22 +209,23 @@ define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrh r2, [r0]
-; CHECK-NEXT:    ldrh.w r12, [r0, #4]
+; CHECK-NEXT:    ldrh r4, [r0]
+; CHECK-NEXT:    ldrh.w lr, [r0, #4]
 ; CHECK-NEXT:    ldrh r3, [r0, #8]
-; CHECK-NEXT:    ldrh.w lr, [r0, #6]
-; CHECK-NEXT:    ldrh r4, [r0, #10]
+; CHECK-NEXT:    vmov.32 q0[0], r4
+; CHECK-NEXT:    ldrh.w r12, [r0, #6]
+; CHECK-NEXT:    ldrh r2, [r0, #10]
 ; CHECK-NEXT:    ldrh r0, [r0, #2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.16 q0[1], r12
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    vmov.16 q0[1], lr
 ; CHECK-NEXT:    vmov.16 q0[2], r3
 ; CHECK-NEXT:    vmov.16 q0[3], r3
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], lr
-; CHECK-NEXT:    vmov.16 q0[6], r4
-; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmov.16 q0[5], r12
+; CHECK-NEXT:    vmov.16 q0[6], r2
+; CHECK-NEXT:    vmov.16 q0[7], r2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -373,8 +376,9 @@ define void @vst4_v2i8(<2 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    ldrb r2, [r0]
 ; CHECK-NEXT:    ldrb r3, [r0, #1]
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    ldrb.w r12, [r0, #2]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q0[2], r3
 ; CHECK-NEXT:    ldrb.w lr, [r0, #3]
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ldrb r4, [r0, #5]
@@ -907,58 +911,61 @@ entry:
 define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vst4_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    ldrd lr, r12, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
-; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, lr
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r0
-; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r3
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmovx.f16 s12, s1
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov.16 q2[0], r3
 ; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.32 q1[2], r2
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s12, s3
 ; CHECK-NEXT:    vmov.16 q2[4], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s1
+; CHECK-NEXT:    vmovx.f16 s12, s5
 ; CHECK-NEXT:    vmov.16 q2[5], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmovx.f16 s12, s7
 ; CHECK-NEXT:    vmov.16 q2[6], r0
 ; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
-; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmovx.f16 s12, s4
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s4
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmovx.f16 s0, s6
 ; CHECK-NEXT:    vmov.16 q2[6], r0
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmov.16 q2[7], r0
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
   %l1 = load <4 x half>, <4 x half>* %s1, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
index 6fb2e44ebf6a..6d7fb9e59c03 100644
--- a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -276,36 +276,44 @@ define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i3
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
 ; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.u16 r3, q1[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.u16 r3, q1[7]
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q0[3], r2
 ; CHECK-NEXT:    vmovlb.u16 q2, q0
 ; CHECK-NEXT:    vldrb.s16 q0, [r1]
 ; CHECK-NEXT:    vmov.u16 r1, q1[0]
 ; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov.32 q2[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    vmovlb.u16 q1, q2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #32]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[0]
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r1
 ; CHECK-NEXT:    vmovlb.u16 q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr

