[llvm] 52e0cf9 - [ARM] Enable subreg liveness

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 17 06:10:44 PDT 2021


Author: David Green
Date: 2021-08-17T14:10:33+01:00
New Revision: 52e0cf9d61618353d2745a51a16ae408edf0f49b

URL: https://github.com/llvm/llvm-project/commit/52e0cf9d61618353d2745a51a16ae408edf0f49b
DIFF: https://github.com/llvm/llvm-project/commit/52e0cf9d61618353d2745a51a16ae408edf0f49b.diff

LOG: [ARM] Enable subreg liveness

This enables subreg liveness in the arm backend when MVE is present,
which allows the register allocator to detect when subregisters are
alive/dead, compared to only acting on full registers. This can help
produce better code on MVE with the way MQPR registers are made up of
SPR registers, but is especially helpful for MQQPR and MQQQQPR
registers, where there are very few "registers" available and being able
to split them up into subregs can help produce much better code.

Differential Revision: https://reviews.llvm.org/D107642

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMSubtarget.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
    llvm/test/CodeGen/Thumb2/active_lane_mask.ll
    llvm/test/CodeGen/Thumb2/mve-be.ll
    llvm/test/CodeGen/Thumb2/mve-ctlz.ll
    llvm/test/CodeGen/Thumb2/mve-ctpop.ll
    llvm/test/CodeGen/Thumb2/mve-cttz.ll
    llvm/test/CodeGen/Thumb2/mve-div-expand.ll
    llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/Thumb2/mve-fmas.ll
    llvm/test/CodeGen/Thumb2/mve-fmath.ll
    llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
    llvm/test/CodeGen/Thumb2/mve-frint.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
    llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/test/CodeGen/Thumb2/mve-masked-store.ll
    llvm/test/CodeGen/Thumb2/mve-minmax.ll
    llvm/test/CodeGen/Thumb2/mve-nofloat.ll
    llvm/test/CodeGen/Thumb2/mve-phireg.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
    llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll
    llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
    llvm/test/CodeGen/Thumb2/mve-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
    llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
    llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
    llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
    llvm/test/CodeGen/Thumb2/mve-vabdus.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
    llvm/test/CodeGen/Thumb2/mve-vcvt.ll
    llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
    llvm/test/CodeGen/Thumb2/mve-vdup.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vhadd.ll
    llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld3.ll
    llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
    llvm/test/CodeGen/Thumb2/mve-vldst4.ll
    llvm/test/CodeGen/Thumb2/mve-vmovn.ll
    llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
    llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll
    llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 90f1b693fec60..2e5bbb66604dd 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -389,7 +389,13 @@ bool ARMSubtarget::enableMachineScheduler() const {
   return useMachineScheduler();
 }
 
-bool ARMSubtarget::enableSubRegLiveness() const { return EnableSubRegLiveness; }
+bool ARMSubtarget::enableSubRegLiveness() const {
+  if (EnableSubRegLiveness.getNumOccurrences())
+    return EnableSubRegLiveness;
+  // Enable SubRegLiveness for MVE to better optimize s subregs for mqpr regs
+  // and q subregs for qqqqpr regs.
+  return hasMVEIntegerOps();
+}
 
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index f101dd4fcec93..603f667d0c615 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -227,11 +227,9 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vadd.f32 q0, q0, r0
-; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    vldr s0, .LCPI1_0
-; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.5:
@@ -280,7 +278,7 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-LABEL: fast_float_half_mac:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    beq.w .LBB2_20
@@ -303,13 +301,13 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    vmul.f16 q5, q6, q5
 ; CHECK-NEXT:    adds r0, #8
-; CHECK-NEXT:    vcvtt.f32.f16 s27, s21
+; CHECK-NEXT:    vcvtt.f32.f16 s23, s21
+; CHECK-NEXT:    vcvtb.f32.f16 s22, s21
+; CHECK-NEXT:    vcvtt.f32.f16 s21, s20
+; CHECK-NEXT:    vcvtb.f32.f16 s20, s20
 ; CHECK-NEXT:    adds r1, #8
-; CHECK-NEXT:    vcvtb.f32.f16 s26, s21
 ; CHECK-NEXT:    adds r3, #4
-; CHECK-NEXT:    vcvtt.f32.f16 s25, s20
-; CHECK-NEXT:    vcvtb.f32.f16 s24, s20
-; CHECK-NEXT:    vadd.f32 q5, q3, q6
+; CHECK-NEXT:    vadd.f32 q5, q3, q5
 ; CHECK-NEXT:    subs.w lr, lr, #1
 ; CHECK-NEXT:    bne .LBB2_3
 ; CHECK-NEXT:    b .LBB2_19
@@ -349,8 +347,8 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    bpl .LBB2_8
 ; CHECK-NEXT:  .LBB2_7: @ %cond.load12
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vldr.16 s24, [r0, #6]
-; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vldr.16 s22, [r0, #6]
+; CHECK-NEXT:    vins.f16 s21, s22
 ; CHECK-NEXT:  .LBB2_8: @ %else13
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    vcmp.u32 cs, q2, q4
@@ -391,15 +389,15 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    bpl .LBB2_5
 ; CHECK-NEXT:  .LBB2_13: @ %cond.load6
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vldr.16 s24, [r0, #2]
-; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vldr.16 s22, [r0, #2]
+; CHECK-NEXT:    vins.f16 s20, s22
 ; CHECK-NEXT:    lsls r4, r2, #29
 ; CHECK-NEXT:    bpl .LBB2_6
 ; CHECK-NEXT:  .LBB2_14: @ %cond.load9
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vmovx.f16 s24, s21
 ; CHECK-NEXT:    vldr.16 s21, [r0, #4]
-; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmovx.f16 s22, s0
+; CHECK-NEXT:    vins.f16 s21, s22
 ; CHECK-NEXT:    lsls r2, r2, #28
 ; CHECK-NEXT:    bmi .LBB2_7
 ; CHECK-NEXT:    b .LBB2_8
@@ -410,21 +408,21 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    bpl .LBB2_10
 ; CHECK-NEXT:  .LBB2_16: @ %cond.load19
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vldr.16 s28, [r1, #2]
-; CHECK-NEXT:    vins.f16 s24, s28
+; CHECK-NEXT:    vldr.16 s26, [r1, #2]
+; CHECK-NEXT:    vins.f16 s24, s26
 ; CHECK-NEXT:    lsls r4, r2, #29
 ; CHECK-NEXT:    bpl .LBB2_11
 ; CHECK-NEXT:  .LBB2_17: @ %cond.load22
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vmovx.f16 s28, s25
 ; CHECK-NEXT:    vldr.16 s25, [r1, #4]
-; CHECK-NEXT:    vins.f16 s25, s28
+; CHECK-NEXT:    vmovx.f16 s26, s0
+; CHECK-NEXT:    vins.f16 s25, s26
 ; CHECK-NEXT:    lsls r2, r2, #28
 ; CHECK-NEXT:    bpl.w .LBB2_2
 ; CHECK-NEXT:  .LBB2_18: @ %cond.load25
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vldr.16 s28, [r1, #6]
-; CHECK-NEXT:    vins.f16 s25, s28
+; CHECK-NEXT:    vldr.16 s26, [r1, #6]
+; CHECK-NEXT:    vins.f16 s25, s26
 ; CHECK-NEXT:    b .LBB2_2
 ; CHECK-NEXT:  .LBB2_19: @ %middle.block
 ; CHECK-NEXT:    vdup.32 q0, r12
@@ -439,9 +437,8 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:  .LBB2_20:
 ; CHECK-NEXT:    vldr s0, .LCPI2_0
 ; CHECK-NEXT:  .LBB2_21: @ %for.cond.cleanup
-; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.22:

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
index fddbfa8b66207..856e150e6012d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
@@ -14,8 +14,8 @@ define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) {
 ; CHECK-NEXT:    vmvn.i32 q1, #0x1f
 ; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    vadd.i32 q1, q3, q1
 ; CHECK-NEXT:    subs r3, r1, #1
+; CHECK-NEXT:    vadd.i32 q1, q3, q1
 ; CHECK-NEXT:    vidup.u32 q2, r2, #8
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vadd.i32 q1, q2, r0

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
index 8c23b09e650bf..9162d4a3f2142 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
@@ -15,10 +15,10 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %arm_mean_f32_mve.exit
-; CHECK-NEXT:    vmov s4, r1
-; CHECK-NEXT:    vadd.f32 s0, s3, s3
-; CHECK-NEXT:    vcvt.f32.u32 s4, s4
-; CHECK-NEXT:    vdiv.f32 s0, s0, s4
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vadd.f32 s2, s3, s3
+; CHECK-NEXT:    vcvt.f32.u32 s0, s0
+; CHECK-NEXT:    vdiv.f32 s0, s2, s0
 ; CHECK-NEXT:    vmov r12, s0
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    dlstp.32 lr, r1

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index 2974db0d816b9..f1b3014e358a0 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -986,11 +986,11 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    vmov.32 q1[1], r10
 ; CHECK-NEXT:    adds r5, #8
 ; CHECK-NEXT:    vmul.f16 q0, q0, q1
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    vstrb.8 q1, [r6], #16
+; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB5_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1100,11 +1100,11 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    vmov.32 q1[1], r10
 ; CHECK-NEXT:    adds r5, #8
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    vstrb.8 q1, [r6], #16
+; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB6_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1214,11 +1214,11 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* no
 ; CHECK-NEXT:    vmov.32 q1[1], r10
 ; CHECK-NEXT:    adds r5, #8
 ; CHECK-NEXT:    vsub.f16 q0, q0, q1
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    vstrb.8 q1, [r6], #16
+; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB7_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1333,11 +1333,11 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no
 ; CHECK-NEXT:    vmov.16 q0[3], r8
 ; CHECK-NEXT:    vcvt.f16.s16 q0, q0
 ; CHECK-NEXT:    vmul.f16 q0, q1, q0
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    vstrb.8 q1, [r6], #16
+; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB8_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    ldr r7, [sp] @ 4-byte Reload

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
index 024857b658023..b01a0cc047c29 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
@@ -240,11 +240,11 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA,
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
-; CHECK-NEXT:    vrintr.f32 s7, s3
-; CHECK-NEXT:    vrintr.f32 s6, s2
-; CHECK-NEXT:    vrintr.f32 s5, s1
-; CHECK-NEXT:    vrintr.f32 s4, s0
-; CHECK-NEXT:    vstrw.32 q1, [r1], #16
+; CHECK-NEXT:    vrintr.f32 s3, s3
+; CHECK-NEXT:    vrintr.f32 s2, s2
+; CHECK-NEXT:    vrintr.f32 s1, s1
+; CHECK-NEXT:    vrintr.f32 s0, s0
+; CHECK-NEXT:    vstrw.32 q0, [r1], #16
 ; CHECK-NEXT:    letp lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
index d5a44e41e77f9..29174b44cd45a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
@@ -11,9 +11,9 @@ define void @arm_cmplx_mag_squared_q15_mve(i16* %pSrc, i16* %pDst, i32 %blockSiz
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
 ; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
-; CHECK-NEXT:    vmulh.s16 q2, q1, q1
+; CHECK-NEXT:    vmulh.s16 q1, q1, q1
 ; CHECK-NEXT:    vmulh.s16 q0, q0, q0
-; CHECK-NEXT:    vqadd.s16 q0, q0, q2
+; CHECK-NEXT:    vqadd.s16 q0, q0, q1
 ; CHECK-NEXT:    vshr.s16 q0, q0, #1
 ; CHECK-NEXT:    vstrh.16 q0, [r1], #16
 ; CHECK-NEXT:    letp lr, .LBB0_1

diff  --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 98f00707df37c..607a55b52370b 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -14,9 +14,9 @@ define <4 x i32> @v4i32(i32 %index, i32 %TC, <4 x i32> %V1, <4 x i32> %V2) {
 ; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.u32 hi, q1, q0
-; CHECK-NEXT:    vmov d0, r2, r3
 ; CHECK-NEXT:    vldr d1, [sp]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov d0, r2, r3
 ; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov r2, r3, d1
@@ -156,8 +156,8 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
 ; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.i16 ne, q0, zr
-; CHECK-NEXT:    vmov d0, r2, r3
 ; CHECK-NEXT:    vldr d1, [sp, #48]
+; CHECK-NEXT:    vmov d0, r2, r3
 ; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vmov r2, r3, d1
@@ -339,12 +339,12 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
 ; CHECK-NEXT:    vmov.8 q3[14], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.8 q3[15], r0
-; CHECK-NEXT:    vmov d0, r2, r3
 ; CHECK-NEXT:    add r0, sp, #88
 ; CHECK-NEXT:    vcmp.i8 ne, q3, zr
 ; CHECK-NEXT:    vldr d1, [sp, #80]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vpnot
+; CHECK-NEXT:    vmov d0, r2, r3
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.i8 ne, q2, zr
 ; CHECK-NEXT:    vpsel q0, q0, q1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll
index d941036488ba6..534530d772418 100644
--- a/llvm/test/CodeGen/Thumb2/mve-be.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-be.ll
@@ -70,10 +70,10 @@ entry:
 define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LE-LABEL: add_soft:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vmov d0, r0, r1
 ; CHECK-LE-NEXT:    mov r0, sp
 ; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-LE-NEXT:    vmov r0, r1, d0
 ; CHECK-LE-NEXT:    vmov r2, r3, d1
@@ -81,9 +81,9 @@ define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
 ;
 ; CHECK-BE-LABEL: add_soft:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
 ; CHECK-BE-NEXT:    mov r0, sp
+; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll
index c44a9efe39573..eee41da87423d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll
@@ -12,8 +12,10 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_0_t(<2 x i64> %src){
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r0, r1
-; CHECK-NEXT:    vmov s6, r0
+; CHECK-NEXT:    vmov s2, r0
 ; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vldr s1, .LCPI0_0
+; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    cset r2, ne
@@ -21,10 +23,7 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_0_t(<2 x i64> %src){
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r0, r1
-; CHECK-NEXT:    vmov s4, r0
-; CHECK-NEXT:    vldr s5, .LCPI0_0
-; CHECK-NEXT:    vmov.f32 s7, s5
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
@@ -76,8 +75,10 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_1_t(<2 x i64> %src){
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r0, r1
-; CHECK-NEXT:    vmov s6, r0
+; CHECK-NEXT:    vmov s2, r0
 ; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vldr s1, .LCPI4_0
+; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    clz r0, r0
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    cset r2, ne
@@ -85,10 +86,7 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_1_t(<2 x i64> %src){
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r0, r1
-; CHECK-NEXT:    vmov s4, r0
-; CHECK-NEXT:    vldr s5, .LCPI4_0
-; CHECK-NEXT:    vmov.f32 s7, s5
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll
index f6f51068dd5f8..724bd4f7963b8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll
@@ -12,6 +12,7 @@ define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){
 ; CHECK-NEXT:    vmov r3, r4, d0
 ; CHECK-NEXT:    mov.w r12, #858993459
 ; CHECK-NEXT:    vldr s1, .LCPI0_0
+; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    and.w r0, lr, r2, lsr #1
 ; CHECK-NEXT:    subs r0, r2, r0
 ; CHECK-NEXT:    and.w r2, r12, r0, lsr #2
@@ -51,7 +52,6 @@ define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){
 ; CHECK-NEXT:    vmov s2, r1
 ; CHECK-NEXT:    add.w r0, r2, r0, lsr #24
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-cttz.ll b/llvm/test/CodeGen/Thumb2/mve-cttz.ll
index b844bc217e571..e5d4a93ee4f67 100644
--- a/llvm/test/CodeGen/Thumb2/mve-cttz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-cttz.ll
@@ -4,8 +4,7 @@
 define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){
 ; CHECK-LABEL: cttz_2i64_0_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    clz r1, r1
@@ -16,7 +15,9 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r1, r0
 ; CHECK-NEXT:    vmov s2, r1
-; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vldr s1, .LCPI0_0
+; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    clz r1, r1
@@ -27,8 +28,6 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r1, r0
 ; CHECK-NEXT:    vmov s0, r1
-; CHECK-NEXT:    vldr s1, .LCPI0_0
-; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
@@ -78,8 +77,7 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){
 ; CHECK-LABEL: cttz_2i64_1_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmov r0, r1, d1
 ; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    clz r1, r1
@@ -90,7 +88,9 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r1, r0
 ; CHECK-NEXT:    vmov s2, r1
-; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vldr s1, .LCPI4_0
+; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    rbit r1, r1
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    clz r1, r1
@@ -101,8 +101,6 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    clzne r1, r0
 ; CHECK-NEXT:    vmov s0, r1
-; CHECK-NEXT:    vldr s1, .LCPI4_0
-; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index 3a746fc749feb..bb853f698cdfd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -724,11 +724,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @fdiv_f32(<4 x float> %in1, <4 x float> %in2) {
 ; CHECK-LABEL: fdiv_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vdiv.f32 s11, s3, s7
-; CHECK-NEXT:    vdiv.f32 s10, s2, s6
-; CHECK-NEXT:    vdiv.f32 s9, s1, s5
-; CHECK-NEXT:    vdiv.f32 s8, s0, s4
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vdiv.f32 s3, s3, s7
+; CHECK-NEXT:    vdiv.f32 s2, s2, s6
+; CHECK-NEXT:    vdiv.f32 s1, s1, s5
+; CHECK-NEXT:    vdiv.f32 s0, s0, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %out = fdiv <4 x float> %in1, %in2
@@ -774,27 +773,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) {
 ; CHECK-LABEL: fdiv_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s2, s8
-; CHECK-NEXT:    vmovx.f16 s14, s9
-; CHECK-NEXT:    vdiv.f16 s12, s2, s0
-; CHECK-NEXT:    vdiv.f16 s0, s8, s4
-; CHECK-NEXT:    vins.f16 s0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vdiv.f16 s1, s9, s5
-; CHECK-NEXT:    vins.f16 s1, s12
-; CHECK-NEXT:    vmovx.f16 s12, s6
-; CHECK-NEXT:    vmovx.f16 s14, s10
-; CHECK-NEXT:    vdiv.f16 s2, s10, s6
-; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vmovx.f16 s14, s11
-; CHECK-NEXT:    vins.f16 s2, s12
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vdiv.f16 s3, s11, s7
-; CHECK-NEXT:    vins.f16 s3, s12
+; CHECK-NEXT:    vmovx.f16 s10, s0
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vdiv.f16 s0, s0, s4
+; CHECK-NEXT:    vdiv.f16 s8, s10, s8
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vdiv.f16 s1, s1, s5
+; CHECK-NEXT:    vdiv.f16 s4, s8, s4
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vmovx.f16 s4, s6
+; CHECK-NEXT:    vdiv.f16 s2, s2, s6
+; CHECK-NEXT:    vdiv.f16 s4, s8, s4
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s6, s3
+; CHECK-NEXT:    vmovx.f16 s4, s7
+; CHECK-NEXT:    vdiv.f16 s3, s3, s7
+; CHECK-NEXT:    vdiv.f16 s4, s6, s4
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %out = fdiv <8 x half> %in1, %in2
@@ -806,8 +804,8 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q5, q0
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s20
@@ -816,59 +814,59 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) {
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s20
-; CHECK-NEXT:    vmov s24, r0
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s24, s24
-; CHECK-NEXT:    vcvtt.f16.f32 s24, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s21
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s25, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s21
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s25, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s22
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s26, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s22
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s26, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s23
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s27, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s23
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s27, s0
-; CHECK-NEXT:    vmov q0, q6
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %out = frem <8 x half> %in1, %in2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index b5c6c216affa3..4af4ec0b885cd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1422,22 +1422,22 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    ldrd r12, r6, [r0, #4]
-; CHECK-NEXT:    and r8, r3, #1
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vldr.16 s4, .LCPI17_0
+; CHECK-NEXT:    and r8, r3, #1
+; CHECK-NEXT:    vldr.16 s0, .LCPI17_0
 ; CHECK-NEXT:    lsr.w r9, r3, #1
-; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    b .LBB17_3
 ; CHECK-NEXT:  .LBB17_1: @ %if.else
 ; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
-; CHECK-NEXT:    vstr.16 s8, [r12]
-; CHECK-NEXT:    vmovx.f16 s9, s8
+; CHECK-NEXT:    vmovx.f16 s5, s4
+; CHECK-NEXT:    vstr.16 s4, [r12]
 ; CHECK-NEXT:  .LBB17_2: @ %if.end
 ; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
-; CHECK-NEXT:    vstr.16 s9, [r12, #2]
+; CHECK-NEXT:    vstr.16 s5, [r12, #2]
 ; CHECK-NEXT:    adds r6, #10
 ; CHECK-NEXT:    subs r0, #1
 ; CHECK-NEXT:    add.w r12, r12, #4
@@ -1446,15 +1446,15 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
 ; CHECK-NEXT:  .LBB17_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB17_5 Depth 2
-; CHECK-NEXT:    vldrh.u16 q3, [r6]
+; CHECK-NEXT:    vldrh.u16 q2, [r6]
 ; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vshlc q4, r5, #16
+; CHECK-NEXT:    vldrh.u16 q3, [r6, #4]
 ; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vshlc q5, r5, #16
-; CHECK-NEXT:    vldrh.u16 q4, [r6, #4]
-; CHECK-NEXT:    vmov q6, q4
-; CHECK-NEXT:    vshlc q6, r5, #16
-; CHECK-NEXT:    vldrh.u16 q2, [r12]
-; CHECK-NEXT:    vmov.f32 s9, s1
+; CHECK-NEXT:    vldrh.u16 q1, [r12]
+; CHECK-NEXT:    vmov.f32 s5, s1
 ; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    wls lr, r9, .LBB17_6
 ; CHECK-NEXT:  @ %bb.4: @ %while.body.preheader
@@ -1464,19 +1464,19 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
 ; CHECK-NEXT:    @ Parent Loop BB17_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r7, [r1], #4
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vfma.f16 q2, q3, r7
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vfma.f16 q1, q2, r7
 ; CHECK-NEXT:    ldrh r4, [r1, #-2]
-; CHECK-NEXT:    vmov.u16 r7, q2[0]
-; CHECK-NEXT:    vfma.f16 q2, q4, r7
-; CHECK-NEXT:    vins.f16 s9, s4
-; CHECK-NEXT:    vfma.f16 q2, q5, r4
-; CHECK-NEXT:    vmov.u16 r4, q2[1]
-; CHECK-NEXT:    vfma.f16 q2, q6, r4
+; CHECK-NEXT:    vmov.u16 r7, q1[0]
+; CHECK-NEXT:    vfma.f16 q1, q3, r7
+; CHECK-NEXT:    vins.f16 s5, s0
+; CHECK-NEXT:    vfma.f16 q1, q4, r4
+; CHECK-NEXT:    vmov.u16 r4, q1[1]
+; CHECK-NEXT:    vfma.f16 q1, q5, r4
 ; CHECK-NEXT:    strh r4, [r5, #2]
-; CHECK-NEXT:    vmov.f32 s8, s9
+; CHECK-NEXT:    vmov.f32 s4, s5
 ; CHECK-NEXT:    strh r7, [r5], #4
-; CHECK-NEXT:    vmov.16 q2[2], r3
+; CHECK-NEXT:    vmov.16 q1[2], r3
 ; CHECK-NEXT:    le lr, .LBB17_5
 ; CHECK-NEXT:  .LBB17_6: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
@@ -1485,15 +1485,15 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
 ; CHECK-NEXT:  @ %bb.7: @ %if.then
 ; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
 ; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vfma.f16 q2, q3, r1
-; CHECK-NEXT:    vmov.u16 r1, q2[0]
-; CHECK-NEXT:    vfma.f16 q2, q4, r1
+; CHECK-NEXT:    vfma.f16 q1, q2, r1
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vfma.f16 q1, q3, r1
 ; CHECK-NEXT:    strh r1, [r5]
-; CHECK-NEXT:    vmovx.f16 s6, s8
-; CHECK-NEXT:    vstr.16 s6, [r12]
+; CHECK-NEXT:    vmovx.f16 s2, s4
+; CHECK-NEXT:    vstr.16 s2, [r12]
 ; CHECK-NEXT:    b .LBB17_2
 ; CHECK-NEXT:  .LBB17_8: @ %do.end
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 ; CHECK-NEXT:    .p2align 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 39ff830e7be63..58177a877338c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1416,8 +1416,8 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biqu
 ; CHECK-NEXT:    @ Child Loop BB17_3 Depth 2
 ; CHECK-NEXT:    ldrd r5, r7, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r3]
-; CHECK-NEXT:    vldr s8, [r0, #8]
 ; CHECK-NEXT:    ldr r6, [r0, #12]
+; CHECK-NEXT:    vldr s8, [r0, #8]
 ; CHECK-NEXT:    vstrw.32 q1, [r4]
 ; CHECK-NEXT:    vdup.32 q1, r7
 ; CHECK-NEXT:    vldr s12, [r0, #16]
@@ -1647,8 +1647,8 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
 ; CHECK-NEXT:    ldrd r12, r10, [r0]
 ; CHECK-NEXT:    @ implicit-def: $s2
 ; CHECK-NEXT:    and r7, r3, #3
@@ -1656,19 +1656,19 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_
 ; CHECK-NEXT:    lsrs r0, r3, #2
 ; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    str r2, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB19_3
 ; CHECK-NEXT:  .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    vmov.f32 s14, s7
-; CHECK-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    vmov.f32 s0, s10
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s7, s6
 ; CHECK-NEXT:  .LBB19_2: @ %if.end69
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    vstr s8, [r10]
+; CHECK-NEXT:    ldr r2, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r12, r12, #1
-; CHECK-NEXT:    vstr s0, [r10, #4]
+; CHECK-NEXT:    vstr s1, [r10]
 ; CHECK-NEXT:    add.w r9, r9, #128
+; CHECK-NEXT:    vstr s4, [r10, #4]
 ; CHECK-NEXT:    vstr s14, [r10, #8]
 ; CHECK-NEXT:    mov r1, r2
 ; CHECK-NEXT:    vstr s7, [r10, #12]
@@ -1677,48 +1677,48 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_
 ; CHECK-NEXT:  .LBB19_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB19_5 Depth 2
-; CHECK-NEXT:    vldr s7, [r10, #8]
-; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vldr s8, [r10]
-; CHECK-NEXT:    vldr s10, [r10, #4]
+; CHECK-NEXT:    mov r5, r2
+; CHECK-NEXT:    vldr s1, [r10]
+; CHECK-NEXT:    vldr s3, [r10, #4]
+; CHECK-NEXT:    vldr s7, [r10, #8]
 ; CHECK-NEXT:    vldr s6, [r10, #12]
 ; CHECK-NEXT:    wls lr, r0, .LBB19_6
 ; CHECK-NEXT:  @ %bb.4: @ %while.body.lr.ph
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldr r5, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT:    ldr r5, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB19_5: @ %while.body
 ; CHECK-NEXT:    @ Parent Loop BB19_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vldr s8, [r1, #12]
-; CHECK-NEXT:    vldrw.u32 q0, [r9, #112]
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vldr s10, [r1, #8]
 ; CHECK-NEXT:    vmov r7, s7
+; CHECK-NEXT:    vldrw.u32 q2, [r9, #16]
 ; CHECK-NEXT:    vmov r11, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r9, #112]
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vldr s1, [r1, #12]
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vldr s3, [r1, #8]
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r9]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov r8, s8
-; CHECK-NEXT:    vldrw.u32 q0, [r9, #16]
+; CHECK-NEXT:    vmov r8, s1
 ; CHECK-NEXT:    ldr r6, [r1, #4]
 ; CHECK-NEXT:    vldrw.u32 q7, [r9, #32]
 ; CHECK-NEXT:    vmul.f32 q1, q1, r8
-; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vldrw.u32 q3, [r9, #48]
-; CHECK-NEXT:    vfma.f32 q1, q0, r0
+; CHECK-NEXT:    vfma.f32 q1, q2, r0
 ; CHECK-NEXT:    ldr r0, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q7, r6
 ; CHECK-NEXT:    vldrw.u32 q6, [r9, #64]
+; CHECK-NEXT:    vmov.f32 s2, s1
 ; CHECK-NEXT:    vfma.f32 q1, q3, r0
 ; CHECK-NEXT:    vldrw.u32 q5, [r9, #80]
 ; CHECK-NEXT:    vfma.f32 q1, q6, r4
 ; CHECK-NEXT:    vldrw.u32 q4, [r9, #96]
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vfma.f32 q1, q5, r3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vfma.f32 q1, q4, r7
-; CHECK-NEXT:    vfma.f32 q1, q0, r11
-; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vfma.f32 q1, q2, r11
 ; CHECK-NEXT:    vstrb.8 q1, [r5], #16
 ; CHECK-NEXT:    le lr, .LBB19_5
 ; CHECK-NEXT:  .LBB19_6: @ %while.end
@@ -1728,74 +1728,68 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_
 ; CHECK-NEXT:    beq .LBB19_1
 ; CHECK-NEXT:  @ %bb.7: @ %if.then
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    vldr s24, [r1]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vldr s0, [r1, #4]
-; CHECK-NEXT:    vldrw.u32 q3, [r9]
-; CHECK-NEXT:    vldr s3, [r1, #12]
-; CHECK-NEXT:    vldrw.u32 q4, [r9, #32]
-; CHECK-NEXT:    vldr s1, [r1, #8]
-; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    vldrw.u32 q2, [r9, #96]
-; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    vmov lr, s6
+; CHECK-NEXT:    vldr s6, [r1, #12]
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vstrw.32 q2, [sp, #8] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r9, #112]
+; CHECK-NEXT:    vldr s1, [r1, #8]
+; CHECK-NEXT:    vldrw.u32 q3, [r9]
+; CHECK-NEXT:    vldr s4, [r1, #4]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    vldrw.u32 q2, [r9, #16]
+; CHECK-NEXT:    vldr s0, [r1]
 ; CHECK-NEXT:    vmul.f32 q3, q3, r6
 ; CHECK-NEXT:    vmov r6, s1
-; CHECK-NEXT:    vstrw.32 q2, [sp, #24] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r9, #112]
+; CHECK-NEXT:    vldrw.u32 q4, [r9, #32]
+; CHECK-NEXT:    vfma.f32 q3, q2, r6
+; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    vldrw.u32 q5, [r9, #48]
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r9, #80]
 ; CHECK-NEXT:    vldrw.u32 q7, [r9, #64]
-; CHECK-NEXT:    vmov r3, s24
-; CHECK-NEXT:    vstrw.32 q2, [sp, #8] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r9, #16]
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    cmp r7, #1
-; CHECK-NEXT:    vfma.f32 q3, q2, r6
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #8] @ 16-byte Reload
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vfma.f32 q3, q4, r4
-; CHECK-NEXT:    vmov lr, s6
 ; CHECK-NEXT:    vfma.f32 q3, q5, r3
+; CHECK-NEXT:    vldrw.u32 q6, [r9, #80]
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vfma.f32 q3, q7, r0
-; CHECK-NEXT:    vfma.f32 q3, q2, r1
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #24] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #8] @ 16-byte Reload
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vfma.f32 q3, q6, r1
+; CHECK-NEXT:    cmp r7, #1
 ; CHECK-NEXT:    vfma.f32 q3, q2, r2
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vfma.f32 q3, q2, lr
 ; CHECK-NEXT:    bne .LBB19_9
 ; CHECK-NEXT:  @ %bb.8: @ %if.then58
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    vstr s12, [r5]
-; CHECK-NEXT:    vmov.f32 s8, s24
-; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s1, s0
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s14, s12
-; CHECK-NEXT:    b .LBB19_11
+; CHECK-NEXT:    b .LBB19_12
 ; CHECK-NEXT:  .LBB19_9: @ %if.else
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    cmp r7, #2
 ; CHECK-NEXT:    vstmia r5, {s12, s13}
-; CHECK-NEXT:    bne .LBB19_12
+; CHECK-NEXT:    bne .LBB19_11
 ; CHECK-NEXT:  @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    vmov.f32 s8, s0
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s4, s0
 ; CHECK-NEXT:    vmov.f32 s14, s13
-; CHECK-NEXT:    vmov.f32 s0, s24
 ; CHECK-NEXT:    vmov.f32 s7, s12
-; CHECK-NEXT:  .LBB19_11: @ %if.end69
-; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    vmov.f32 s2, s3
-; CHECK-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    b .LBB19_2
-; CHECK-NEXT:  .LBB19_12: @ %if.else64
+; CHECK-NEXT:    b .LBB19_12
+; CHECK-NEXT:  .LBB19_11: @ %if.else64
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    vmov.f32 s7, s13
-; CHECK-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s3
 ; CHECK-NEXT:    vstr s14, [r5, #8]
-; CHECK-NEXT:    vmov.f32 s8, s1
+; CHECK-NEXT:  .LBB19_12: @ %if.end69
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vmov.f32 s2, s6
 ; CHECK-NEXT:    b .LBB19_2
 ; CHECK-NEXT:  .LBB19_13: @ %do.end
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -2026,8 +2020,8 @@ define void @arm_biquad_cascade_df2T_f32(%struct.arm_biquad_cascade_df2T_instanc
 ; CHECK-NEXT:    b .LBB20_3
 ; CHECK-NEXT:  .LBB20_1: @ %if.else
 ; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
-; CHECK-NEXT:    vstr s4, [r12]
 ; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vstr s4, [r12]
 ; CHECK-NEXT:  .LBB20_2: @ %if.end
 ; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
 ; CHECK-NEXT:    vstr s6, [r12, #4]
@@ -2209,9 +2203,9 @@ do.end:                                           ; preds = %if.end
 define arm_aapcs_vfpcc float @vecAddAcrossF32Mve(<4 x float> %in) {
 ; CHECK-LABEL: vecAddAcrossF32Mve:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f32 s4, s0, s1
-; CHECK-NEXT:    vadd.f32 s4, s4, s2
-; CHECK-NEXT:    vadd.f32 s0, s4, s3
+; CHECK-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    vadd.f32 s0, s0, s3
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = extractelement <4 x float> %in, i32 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
index a212158487286..b13a98666c728 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
@@ -22,25 +22,25 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1(<8 x half> %src1, <8 x half> %src2,
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
 ; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vins.f16 s0, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vmla.f16 s12, s8, s4
 ; CHECK-MVE-NEXT:    vmla.f16 s1, s5, s9
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
-; CHECK-MVE-NEXT:    vins.f16 s1, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vmla.f16 s12, s8, s4
 ; CHECK-MVE-NEXT:    vmla.f16 s2, s6, s10
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
-; CHECK-MVE-NEXT:    vins.f16 s2, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
 ; CHECK-MVE-NEXT:    vmla.f16 s3, s7, s11
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vins.f16 s3, s13
+; CHECK-MVE-NEXT:    vmla.f16 s8, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s13
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -67,25 +67,25 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2(<8 x half> %src1, <8 x half> %src2,
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
 ; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vins.f16 s0, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vmla.f16 s12, s8, s4
 ; CHECK-MVE-NEXT:    vmla.f16 s1, s5, s9
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
-; CHECK-MVE-NEXT:    vins.f16 s1, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vmla.f16 s12, s8, s4
 ; CHECK-MVE-NEXT:    vmla.f16 s2, s6, s10
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
-; CHECK-MVE-NEXT:    vins.f16 s2, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
 ; CHECK-MVE-NEXT:    vmla.f16 s3, s7, s11
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vins.f16 s3, s13
+; CHECK-MVE-NEXT:    vmla.f16 s8, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s13
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -112,25 +112,25 @@ define arm_aapcs_vfpcc <8 x half> @vfms16(<8 x half> %src1, <8 x half> %src2, <8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    vmls.f16 s0, s4, s8
 ; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vins.f16 s0, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vmls.f16 s12, s8, s4
 ; CHECK-MVE-NEXT:    vmls.f16 s1, s5, s9
-; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
-; CHECK-MVE-NEXT:    vins.f16 s1, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vmls.f16 s12, s8, s4
 ; CHECK-MVE-NEXT:    vmls.f16 s2, s6, s10
-; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
-; CHECK-MVE-NEXT:    vins.f16 s2, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
 ; CHECK-MVE-NEXT:    vmls.f16 s3, s7, s11
-; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vins.f16 s3, s13
+; CHECK-MVE-NEXT:    vmls.f16 s8, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s13
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -161,22 +161,22 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16(<8 x half> %src1, <8 x half> %src2, f
 ; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
 ; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
 ; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s5
-; CHECK-MVE-NEXT:    vins.f16 s0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
-; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmla.f16 s1, s5, s8
-; CHECK-MVE-NEXT:    vins.f16 s1, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s6
+; CHECK-MVE-NEXT:    vmla.f16 s10, s4, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s2
+; CHECK-MVE-NEXT:    vmla.f16 s10, s4, s8
 ; CHECK-MVE-NEXT:    vmla.f16 s2, s6, s8
-; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s7
-; CHECK-MVE-NEXT:    vins.f16 s2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
-; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vmla.f16 s6, s4, s8
 ; CHECK-MVE-NEXT:    vmla.f16 s3, s7, s8
-; CHECK-MVE-NEXT:    vins.f16 s3, s12
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s2, s10
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %src3 = fptrunc float %src3o to half
@@ -207,33 +207,33 @@ define arm_aapcs_vfpcc <8 x half> @vfma16(<8 x half> %src1, <8 x half> %src2, fl
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vmov q3, q0
 ; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s12
 ; CHECK-MVE-NEXT:    vmov.f32 s8, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vmla.f16 s8, s2, s0
 ; CHECK-MVE-NEXT:    vmov.f32 s0, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s9, s12
-; CHECK-MVE-NEXT:    vmla.f16 s8, s9, s10
 ; CHECK-MVE-NEXT:    vmla.f16 s0, s12, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmov.f32 s9, s3
 ; CHECK-MVE-NEXT:    vmov.f32 s1, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s13
+; CHECK-MVE-NEXT:    vmov.f32 s8, s3
+; CHECK-MVE-NEXT:    vmla.f16 s8, s4, s2
 ; CHECK-MVE-NEXT:    vmla.f16 s1, s13, s5
-; CHECK-MVE-NEXT:    vmla.f16 s9, s10, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vins.f16 s1, s9
-; CHECK-MVE-NEXT:    vmov.f32 s9, s3
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s14
+; CHECK-MVE-NEXT:    vmov.f32 s8, s3
+; CHECK-MVE-NEXT:    vmla.f16 s8, s4, s2
 ; CHECK-MVE-NEXT:    vmov.f32 s2, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s14
-; CHECK-MVE-NEXT:    vmla.f16 s9, s10, s8
 ; CHECK-MVE-NEXT:    vmla.f16 s2, s14, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s9
-; CHECK-MVE-NEXT:    vmov.f32 s9, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s15
-; CHECK-MVE-NEXT:    vmla.f16 s9, s10, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmov.f32 s8, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    vmla.f16 s3, s15, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s9
+; CHECK-MVE-NEXT:    vmla.f16 s8, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %src3 = fptrunc float %src3o to half
@@ -364,14 +364,13 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32(<4 x float> %src1, <4 x float> %src2
 ;
 ; CHECK-MVE-LABEL: vfmas32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    @ kill: def $s8 killed $s8 def $q2
 ; CHECK-MVE-NEXT:    vmov.f32 s11, s8
-; CHECK-MVE-NEXT:    vmla.f32 s11, s3, s7
 ; CHECK-MVE-NEXT:    vmov.f32 s10, s8
-; CHECK-MVE-NEXT:    vmla.f32 s10, s2, s6
 ; CHECK-MVE-NEXT:    vmov.f32 s9, s8
-; CHECK-MVE-NEXT:    vmla.f32 s9, s1, s5
 ; CHECK-MVE-NEXT:    vmla.f32 s8, s0, s4
+; CHECK-MVE-NEXT:    vmla.f32 s11, s3, s7
+; CHECK-MVE-NEXT:    vmla.f32 s10, s2, s6
+; CHECK-MVE-NEXT:    vmla.f32 s9, s1, s5
 ; CHECK-MVE-NEXT:    vmov q0, q2
 ; CHECK-MVE-NEXT:    bx lr
 entry:
@@ -401,8 +400,6 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1_pred(<8 x half> %src1, <8 x half> %
 ;
 ; CHECK-MVE-LABEL: vfma16_v1_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
@@ -417,98 +414,96 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1_pred(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s14, s0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
+; CHECK-MVE-NEXT:    vmla.f16 s14, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
+; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
+; CHECK-MVE-NEXT:    vmov.f32 s14, s12
+; CHECK-MVE-NEXT:    vmla.f16 s14, s8, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s1
+; CHECK-MVE-NEXT:    vmov.f32 s8, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s18, s5, s9
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s1, s18
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
+; CHECK-MVE-NEXT:    vmla.f16 s8, s5, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
+; CHECK-MVE-NEXT:    vmov.f32 s14, s12
+; CHECK-MVE-NEXT:    vmla.f16 s14, s8, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s8, s2
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s2
+; CHECK-MVE-NEXT:    vmla.f16 s8, s6, s10
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s18, s6, s10
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s18
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s2, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s11
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
+; CHECK-MVE-NEXT:    vmov.f32 s10, s8
+; CHECK-MVE-NEXT:    vmla.f16 s10, s6, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s10
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s3
+; CHECK-MVE-NEXT:    vmov.f32 s6, s3
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmla.f16 s18, s7, s11
+; CHECK-MVE-NEXT:    vmla.f16 s6, s7, s11
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s3, s18
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s3, s6
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -534,8 +529,6 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2_pred(<8 x half> %src1, <8 x half> %
 ;
 ; CHECK-MVE-LABEL: vfma16_v2_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
@@ -550,98 +543,96 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2_pred(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s14, s0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
+; CHECK-MVE-NEXT:    vmla.f16 s14, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
+; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
+; CHECK-MVE-NEXT:    vmov.f32 s14, s12
+; CHECK-MVE-NEXT:    vmla.f16 s14, s8, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s1
+; CHECK-MVE-NEXT:    vmov.f32 s8, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s18, s5, s9
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s1, s18
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
+; CHECK-MVE-NEXT:    vmla.f16 s8, s5, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
+; CHECK-MVE-NEXT:    vmov.f32 s14, s12
+; CHECK-MVE-NEXT:    vmla.f16 s14, s8, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s8, s2
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s2
+; CHECK-MVE-NEXT:    vmla.f16 s8, s6, s10
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s18, s6, s10
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s18
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s2, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s11
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
+; CHECK-MVE-NEXT:    vmov.f32 s10, s8
+; CHECK-MVE-NEXT:    vmla.f16 s10, s6, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s10
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s3
+; CHECK-MVE-NEXT:    vmov.f32 s6, s3
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmla.f16 s18, s7, s11
+; CHECK-MVE-NEXT:    vmla.f16 s6, s7, s11
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s3, s18
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s3, s6
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -667,8 +658,6 @@ define arm_aapcs_vfpcc <8 x half> @vfms16_pred(<8 x half> %src1, <8 x half> %src
 ;
 ; CHECK-MVE-LABEL: vfms16_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
@@ -683,98 +672,96 @@ define arm_aapcs_vfpcc <8 x half> @vfms16_pred(<8 x half> %src1, <8 x half> %src
 ; CHECK-MVE-NEXT:    vmls.f16 s15, s14, s12
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s14, s0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
+; CHECK-MVE-NEXT:    vmls.f16 s14, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmls.f16 s12, s4, s8
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
+; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmls.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
+; CHECK-MVE-NEXT:    vmov.f32 s14, s12
+; CHECK-MVE-NEXT:    vmls.f16 s14, s8, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s1
+; CHECK-MVE-NEXT:    vmov.f32 s8, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmls.f16 s18, s5, s9
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s1, s18
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
+; CHECK-MVE-NEXT:    vmls.f16 s8, s5, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmls.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
+; CHECK-MVE-NEXT:    vmov.f32 s14, s12
+; CHECK-MVE-NEXT:    vmls.f16 s14, s8, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s8, s2
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s2
+; CHECK-MVE-NEXT:    vmls.f16 s8, s6, s10
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmls.f16 s18, s6, s10
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s18
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s2, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s11
-; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmls.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
+; CHECK-MVE-NEXT:    vmov.f32 s10, s8
+; CHECK-MVE-NEXT:    vmls.f16 s10, s6, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s10
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmov.f32 s18, s3
+; CHECK-MVE-NEXT:    vmov.f32 s6, s3
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmls.f16 s18, s7, s11
+; CHECK-MVE-NEXT:    vmls.f16 s6, s7, s11
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s3, s18
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s3, s6
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -805,108 +792,107 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16_pred(<8 x half> %src1, <8 x half> %sr
 ;
 ; CHECK-MVE-LABEL: vfmar16_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s4
-; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s10, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s10
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s8, s8
+; CHECK-MVE-NEXT:    vmov.f32 s14, s12
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s14, s8, s12
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmla.f16 s14, s10, s8
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s14
+; CHECK-MVE-NEXT:    vseleq.f16 s10, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s8, s0
+; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s8, s4, s12
-; CHECK-MVE-NEXT:    vseleq.f16 s8, s0, s8
-; CHECK-MVE-NEXT:    movs r1, #0
-; CHECK-MVE-NEXT:    vins.f16 s8, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s0, s10
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
+; CHECK-MVE-NEXT:    vmov.f32 s12, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
 ; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s10, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s1
+; CHECK-MVE-NEXT:    vmov.f32 s10, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s13, s5, s12
+; CHECK-MVE-NEXT:    vmla.f16 s10, s5, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s9, s1, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vins.f16 s9, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s13
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s1, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s2
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
+; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s10, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s2
+; CHECK-MVE-NEXT:    vmov.f32 s10, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmla.f16 s10, s6, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s13, s6, s12
-; CHECK-MVE-NEXT:    vseleq.f16 s10, s2, s13
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s2, s10
 ; CHECK-MVE-NEXT:    movs r1, #0
-; CHECK-MVE-NEXT:    vins.f16 s10, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s13
+; CHECK-MVE-NEXT:    vmov.f32 s10, s6
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
+; CHECK-MVE-NEXT:    vmla.f16 s10, s4, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s10
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s3
+; CHECK-MVE-NEXT:    vmov.f32 s6, s3
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmla.f16 s13, s7, s12
+; CHECK-MVE-NEXT:    vmla.f16 s6, s7, s8
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s11, s3, s13
-; CHECK-MVE-NEXT:    vins.f16 s11, s14
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s3, s6
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %src3 = fptrunc float %src3o to half
@@ -942,104 +928,103 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_pred(<8 x half> %src1, <8 x half> %src
 ; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s10, #0
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s12, s8
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s8, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s0
-; CHECK-MVE-NEXT:    vmov.f32 s8, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
+; CHECK-MVE-NEXT:    vmov.f32 s14, s8
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s8, s14, s10
+; CHECK-MVE-NEXT:    vmla.f16 s14, s12, s10
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s14, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s10, s12, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s8
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s8, s12
+; CHECK-MVE-NEXT:    vmla.f16 s12, s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s8, s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s8, s0, s8
-; CHECK-MVE-NEXT:    movs r1, #0
-; CHECK-MVE-NEXT:    vins.f16 s8, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s0, s10
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s12
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s1
+; CHECK-MVE-NEXT:    vmov.f32 s12, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s4
 ; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s10, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s12
+; CHECK-MVE-NEXT:    vmov.f32 s10, s8
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s13, s1, s5
+; CHECK-MVE-NEXT:    vmla.f16 s10, s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s9, s1, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vins.f16 s9, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s1, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s2
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
+; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s10, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s12
+; CHECK-MVE-NEXT:    vmov.f32 s10, s8
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmla.f16 s10, s2, s6
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmla.f16 s13, s2, s6
-; CHECK-MVE-NEXT:    vseleq.f16 s10, s2, s13
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s2, s10
 ; CHECK-MVE-NEXT:    movs r1, #0
-; CHECK-MVE-NEXT:    vins.f16 s10, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s12
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmov.f32 s10, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
+; CHECK-MVE-NEXT:    vmla.f16 s10, s6, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s10
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmla.f16 s12, s3, s7
+; CHECK-MVE-NEXT:    vmla.f16 s8, s3, s7
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s11, s3, s12
-; CHECK-MVE-NEXT:    vins.f16 s11, s14
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s3, s8
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %src3 = fptrunc float %src3o to half
@@ -1068,51 +1053,50 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v1_pred(<4 x float> %src1, <4 x float
 ;
 ; CHECK-MVE-LABEL: vfma32_v1_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    vmov.f32 s14, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s1
+; CHECK-MVE-NEXT:    movs r2, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    movs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s1
+; CHECK-MVE-NEXT:    vmla.f32 s14, s4, s8
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    vmov.f32 s14, s0
+; CHECK-MVE-NEXT:    vmov.f32 s8, s3
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s2
+; CHECK-MVE-NEXT:    vmla.f32 s12, s5, s9
 ; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmov.f32 s5, s2
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
-; CHECK-MVE-NEXT:    movs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r3, #1
 ; CHECK-MVE-NEXT:    cmp r3, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s6, #0
 ; CHECK-MVE-NEXT:    cset r3, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmla.f32 s13, s7, s11
 ; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    vmla.f32 s8, s7, s11
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vmla.f32 s12, s5, s9
-; CHECK-MVE-NEXT:    vmla.f32 s14, s4, s8
-; CHECK-MVE-NEXT:    vmla.f32 s15, s6, s10
-; CHECK-MVE-NEXT:    vseleq.f32 s7, s3, s13
+; CHECK-MVE-NEXT:    vmla.f32 s5, s6, s10
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s8
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s6, s2, s15
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s5, s1, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s12
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s4, s0, s14
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s14
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <4 x float> %src2, %src3
@@ -1138,51 +1122,50 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v2_pred(<4 x float> %src1, <4 x float
 ;
 ; CHECK-MVE-LABEL: vfma32_v2_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    vmov.f32 s14, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s1
+; CHECK-MVE-NEXT:    movs r2, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    movs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s1
+; CHECK-MVE-NEXT:    vmla.f32 s14, s4, s8
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    vmov.f32 s14, s0
+; CHECK-MVE-NEXT:    vmov.f32 s8, s3
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s2
+; CHECK-MVE-NEXT:    vmla.f32 s12, s5, s9
 ; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmov.f32 s5, s2
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
-; CHECK-MVE-NEXT:    movs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r3, #1
 ; CHECK-MVE-NEXT:    cmp r3, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s6, #0
 ; CHECK-MVE-NEXT:    cset r3, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmla.f32 s13, s7, s11
 ; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    vmla.f32 s8, s7, s11
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vmla.f32 s12, s5, s9
-; CHECK-MVE-NEXT:    vmla.f32 s14, s4, s8
-; CHECK-MVE-NEXT:    vmla.f32 s15, s6, s10
-; CHECK-MVE-NEXT:    vseleq.f32 s7, s3, s13
+; CHECK-MVE-NEXT:    vmla.f32 s5, s6, s10
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s8
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s6, s2, s15
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s5, s1, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s12
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s4, s0, s14
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s14
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <4 x float> %src2, %src3
@@ -1208,51 +1191,50 @@ define arm_aapcs_vfpcc <4 x float> @vfms32_pred(<4 x float> %src1, <4 x float> %
 ;
 ; CHECK-MVE-LABEL: vfms32_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    vmov.f32 s14, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s1
+; CHECK-MVE-NEXT:    movs r2, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    movs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s13, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s1
+; CHECK-MVE-NEXT:    vmls.f32 s14, s4, s8
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    vmov.f32 s14, s0
+; CHECK-MVE-NEXT:    vmov.f32 s8, s3
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s15, s2
+; CHECK-MVE-NEXT:    vmls.f32 s12, s5, s9
 ; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmov.f32 s5, s2
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
-; CHECK-MVE-NEXT:    movs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r3, #1
 ; CHECK-MVE-NEXT:    cmp r3, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s6, #0
 ; CHECK-MVE-NEXT:    cset r3, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmls.f32 s13, s7, s11
 ; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    vmls.f32 s8, s7, s11
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vmls.f32 s12, s5, s9
-; CHECK-MVE-NEXT:    vmls.f32 s14, s4, s8
-; CHECK-MVE-NEXT:    vmls.f32 s15, s6, s10
-; CHECK-MVE-NEXT:    vseleq.f32 s7, s3, s13
+; CHECK-MVE-NEXT:    vmls.f32 s5, s6, s10
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s8
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s6, s2, s15
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s5, s1, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s12
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s4, s0, s14
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s14
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <4 x float> %src2, %src3
@@ -1281,8 +1263,10 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float>
 ;
 ; CHECK-MVE-LABEL: vfmar32_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    vmov.f32 s10, s1
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
+; CHECK-MVE-NEXT:    movs r2, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -1291,17 +1275,16 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    vmov.f32 s14, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s10, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s9, s2
+; CHECK-MVE-NEXT:    vmla.f32 s10, s5, s8
+; CHECK-MVE-NEXT:    vmov.f32 s5, s2
 ; CHECK-MVE-NEXT:    cset r2, ne
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
 ; CHECK-MVE-NEXT:    movs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov.f32 s12, s0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r3, #1
 ; CHECK-MVE-NEXT:    cmp r3, #0
@@ -1312,20 +1295,18 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmla.f32 s14, s7, s8
 ; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    vmla.f32 s14, s7, s8
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vmla.f32 s10, s5, s8
-; CHECK-MVE-NEXT:    vmla.f32 s12, s4, s8
-; CHECK-MVE-NEXT:    vmla.f32 s9, s6, s8
-; CHECK-MVE-NEXT:    vseleq.f32 s7, s3, s14
+; CHECK-MVE-NEXT:    vmla.f32 s5, s6, s8
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s14
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s6, s2, s9
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s5, s1, s10
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s10
+; CHECK-MVE-NEXT:    vmla.f32 s12, s4, s8
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s4, s0, s12
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s12
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %i = insertelement <4 x float> undef, float %src3, i32 0
@@ -1366,15 +1347,15 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s10, s8
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    vmov.f32 s12, s8
 ; CHECK-MVE-NEXT:    cmp r2, #0
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
 ; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmov.f32 s10, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r3, #0
+; CHECK-MVE-NEXT:    vmov.f32 s12, s8
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r3, #1
 ; CHECK-MVE-NEXT:    cmp r3, #0
@@ -1388,17 +1369,16 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    vmla.f32 s14, s3, s7
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vmla.f32 s10, s1, s5
-; CHECK-MVE-NEXT:    vmla.f32 s12, s0, s4
 ; CHECK-MVE-NEXT:    vmla.f32 s8, s2, s6
-; CHECK-MVE-NEXT:    vseleq.f32 s7, s3, s14
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s14
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s6, s2, s8
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s8
+; CHECK-MVE-NEXT:    vmla.f32 s10, s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s5, s1, s10
+; CHECK-MVE-NEXT:    vmla.f32 s12, s0, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s10
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s4, s0, s12
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s12
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %i = insertelement <4 x float> undef, float %src3, i32 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
index d75025c012072..cfda5a737e886 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
@@ -5,11 +5,10 @@
 define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) {
 ; CHECK-LABEL: sqrt_float32_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vsqrt.f32 s7, s3
-; CHECK-NEXT:    vsqrt.f32 s6, s2
-; CHECK-NEXT:    vsqrt.f32 s5, s1
-; CHECK-NEXT:    vsqrt.f32 s4, s0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vsqrt.f32 s3, s3
+; CHECK-NEXT:    vsqrt.f32 s2, s2
+; CHECK-NEXT:    vsqrt.f32 s1, s1
+; CHECK-NEXT:    vsqrt.f32 s0, s0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %src)
@@ -19,23 +18,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @sqrt_float16_t(<8 x half> %src) {
 ; CHECK-LABEL: sqrt_float16_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vsqrt.f16 s8, s0
-; CHECK-NEXT:    vsqrt.f16 s0, s4
-; CHECK-NEXT:    vins.f16 s0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vsqrt.f16 s8, s8
-; CHECK-NEXT:    vsqrt.f16 s1, s5
-; CHECK-NEXT:    vins.f16 s1, s8
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vsqrt.f16 s8, s8
-; CHECK-NEXT:    vsqrt.f16 s2, s6
-; CHECK-NEXT:    vins.f16 s2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s7
-; CHECK-NEXT:    vsqrt.f16 s8, s8
-; CHECK-NEXT:    vsqrt.f16 s3, s7
-; CHECK-NEXT:    vins.f16 s3, s8
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vsqrt.f16 s0, s0
+; CHECK-NEXT:    vsqrt.f16 s4, s4
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vsqrt.f16 s4, s4
+; CHECK-NEXT:    vsqrt.f16 s1, s1
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vsqrt.f16 s4, s4
+; CHECK-NEXT:    vsqrt.f16 s2, s2
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vsqrt.f16 s4, s4
+; CHECK-NEXT:    vsqrt.f16 s3, s3
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = call fast <8 x half> @llvm.sqrt.v8f16(<8 x half> %src)
@@ -101,52 +99,52 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
-; CHECK-NEXT:    vmov s20, r0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s20
-; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.cos.v8f16(<8 x half> %src)
@@ -212,52 +210,52 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
-; CHECK-NEXT:    vmov s20, r0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s20
-; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.sin.v8f16(<8 x half> %src)
@@ -323,52 +321,52 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
-; CHECK-NEXT:    vmov s20, r0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s20
-; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.exp.v8f16(<8 x half> %src)
@@ -434,52 +432,52 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
-; CHECK-NEXT:    vmov s20, r0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s20
-; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.exp2.v8f16(<8 x half> %src)
@@ -545,52 +543,52 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
-; CHECK-NEXT:    vmov s20, r0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s20
-; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.log.v8f16(<8 x half> %src)
@@ -656,52 +654,52 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
-; CHECK-NEXT:    vmov s20, r0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s20
-; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.log2.v8f16(<8 x half> %src)
@@ -767,52 +765,52 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
-; CHECK-NEXT:    vmov s20, r0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s20, s20
-; CHECK-NEXT:    vcvtt.f16.f32 s20, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s21, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s22, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s23, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s23, s0
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.log10.v8f16(<8 x half> %src)
@@ -883,8 +881,8 @@ define arm_aapcs_vfpcc <8 x half> @pow_float16_t(<8 x half> %src1, <8 x half> %s
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q5, q0
 ; CHECK-NEXT:    vmov q4, q1
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s20
@@ -893,59 +891,59 @@ define arm_aapcs_vfpcc <8 x half> @pow_float16_t(<8 x half> %src1, <8 x half> %s
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s20
-; CHECK-NEXT:    vmov s24, r0
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmov s16, r0
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s24, s24
-; CHECK-NEXT:    vcvtt.f16.f32 s24, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s16
+; CHECK-NEXT:    vcvtt.f16.f32 s16, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s21
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s25, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s21
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s17
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s25, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s17, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s22
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s26, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s22
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s18
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s26, s0
+; CHECK-NEXT:    vcvtt.f16.f32 s18, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s23
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtb.f16.f32 s27, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s23
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s19
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vcvtt.f16.f32 s27, s0
-; CHECK-NEXT:    vmov q0, q6
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vcvtt.f16.f32 s19, s0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = call fast <8 x half> @llvm.pow.v8f16(<8 x half> %src1, <8 x half> %src2)
@@ -996,8 +994,8 @@ define arm_aapcs_vfpcc <4 x float> @copysign_float32_t(<4 x float> %src1, <4 x f
 ; CHECK-NEXT:    bfi r5, r1, #31, #1
 ; CHECK-NEXT:    lsr.w r1, r12, #31
 ; CHECK-NEXT:    bfi r3, r1, #31, #1
-; CHECK-NEXT:    vmov s3, r5
 ; CHECK-NEXT:    vmov s2, r4
+; CHECK-NEXT:    vmov s3, r5
 ; CHECK-NEXT:    vmov s1, r0
 ; CHECK-NEXT:    vmov s0, r3
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -1013,81 +1011,80 @@ define arm_aapcs_vfpcc <8 x half> @copysign_float16_t(<8 x half> %src1, <8 x hal
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NEXT:    vstr.16 s8, [sp, #24]
-; CHECK-NEXT:    vmovx.f16 s8, s5
 ; CHECK-NEXT:    vstr.16 s4, [sp, #28]
-; CHECK-NEXT:    vstr.16 s8, [sp, #16]
-; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vstr.16 s4, [sp, #16]
+; CHECK-NEXT:    vmovx.f16 s4, s6
 ; CHECK-NEXT:    vstr.16 s5, [sp, #20]
-; CHECK-NEXT:    vstr.16 s8, [sp, #8]
-; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vstr.16 s4, [sp, #8]
+; CHECK-NEXT:    vmovx.f16 s4, s7
 ; CHECK-NEXT:    vstr.16 s6, [sp, #12]
-; CHECK-NEXT:    vstr.16 s8, [sp]
+; CHECK-NEXT:    vstr.16 s4, [sp]
 ; CHECK-NEXT:    vstr.16 s7, [sp, #4]
-; CHECK-NEXT:    vmovx.f16 s4, s0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #25]
+; CHECK-NEXT:    vmovx.f16 s4, s0
 ; CHECK-NEXT:    vabs.f16 s4, s4
-; CHECK-NEXT:    vneg.f16 s6, s4
+; CHECK-NEXT:    vabs.f16 s0, s0
 ; CHECK-NEXT:    tst.w r0, #128
+; CHECK-NEXT:    vneg.f16 s6, s4
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #29]
-; CHECK-NEXT:    vseleq.f16 s8, s4, s6
-; CHECK-NEXT:    vabs.f16 s4, s0
+; CHECK-NEXT:    vseleq.f16 s4, s4, s6
+; CHECK-NEXT:    vneg.f16 s6, s0
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vneg.f16 s6, s4
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vabs.f16 s0, s3
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #17]
-; CHECK-NEXT:    vseleq.f16 s4, s4, s6
+; CHECK-NEXT:    vseleq.f16 s0, s0, s6
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vabs.f16 s8, s8
+; CHECK-NEXT:    vabs.f16 s4, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #21]
-; CHECK-NEXT:    vneg.f16 s10, s8
-; CHECK-NEXT:    vseleq.f16 s8, s8, s10
-; CHECK-NEXT:    vabs.f16 s10, s1
+; CHECK-NEXT:    vneg.f16 s6, s4
+; CHECK-NEXT:    vseleq.f16 s4, s4, s6
+; CHECK-NEXT:    vabs.f16 s6, s1
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vneg.f16 s12, s10
+; CHECK-NEXT:    vneg.f16 s8, s6
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #9]
-; CHECK-NEXT:    vseleq.f16 s5, s10, s12
+; CHECK-NEXT:    vseleq.f16 s1, s6, s8
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vmovx.f16 s4, s2
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vabs.f16 s8, s8
+; CHECK-NEXT:    vabs.f16 s4, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #13]
-; CHECK-NEXT:    vneg.f16 s10, s8
-; CHECK-NEXT:    vseleq.f16 s8, s8, s10
-; CHECK-NEXT:    vabs.f16 s10, s2
+; CHECK-NEXT:    vneg.f16 s6, s4
+; CHECK-NEXT:    vseleq.f16 s4, s4, s6
+; CHECK-NEXT:    vabs.f16 s2, s2
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vneg.f16 s12, s10
+; CHECK-NEXT:    vneg.f16 s6, s2
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vneg.f16 s2, s0
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #1]
-; CHECK-NEXT:    vseleq.f16 s6, s10, s12
+; CHECK-NEXT:    vseleq.f16 s2, s2, s6
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vins.f16 s6, s8
-; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s3
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vabs.f16 s8, s8
+; CHECK-NEXT:    vabs.f16 s4, s4
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #5]
-; CHECK-NEXT:    vneg.f16 s10, s8
-; CHECK-NEXT:    vseleq.f16 s8, s8, s10
+; CHECK-NEXT:    vneg.f16 s6, s4
+; CHECK-NEXT:    vseleq.f16 s4, s4, s6
+; CHECK-NEXT:    vabs.f16 s6, s3
 ; CHECK-NEXT:    tst.w r0, #128
+; CHECK-NEXT:    vneg.f16 s8, s6
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vseleq.f16 s7, s0, s2
-; CHECK-NEXT:    vins.f16 s7, s8
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vseleq.f16 s3, s6, s8
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
index 1ec651ac06de8..0e993f35ce85d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
@@ -5,23 +5,22 @@
 define arm_aapcs_vfpcc <8 x half> @fneg_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fneg_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
-; CHECK-MVE-NEXT:    vneg.f16 s8, s0
-; CHECK-MVE-NEXT:    vneg.f16 s0, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vneg.f16 s8, s8
-; CHECK-MVE-NEXT:    vneg.f16 s1, s5
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vneg.f16 s8, s8
-; CHECK-MVE-NEXT:    vneg.f16 s2, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vneg.f16 s8, s8
-; CHECK-MVE-NEXT:    vneg.f16 s3, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
+; CHECK-MVE-NEXT:    vneg.f16 s0, s0
+; CHECK-MVE-NEXT:    vneg.f16 s4, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vneg.f16 s4, s4
+; CHECK-MVE-NEXT:    vneg.f16 s1, s1
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vneg.f16 s4, s4
+; CHECK-MVE-NEXT:    vneg.f16 s2, s2
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vneg.f16 s4, s4
+; CHECK-MVE-NEXT:    vneg.f16 s3, s3
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fneg_float16_t:
@@ -36,11 +35,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @fneg_float32_t(<4 x float> %src) {
 ; CHECK-MVE-LABEL: fneg_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vneg.f32 s7, s3
-; CHECK-MVE-NEXT:    vneg.f32 s6, s2
-; CHECK-MVE-NEXT:    vneg.f32 s5, s1
-; CHECK-MVE-NEXT:    vneg.f32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vneg.f32 s3, s3
+; CHECK-MVE-NEXT:    vneg.f32 s2, s2
+; CHECK-MVE-NEXT:    vneg.f32 s1, s1
+; CHECK-MVE-NEXT:    vneg.f32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fneg_float32_t:
@@ -77,23 +75,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fabs_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fabs_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
-; CHECK-MVE-NEXT:    vabs.f16 s8, s0
-; CHECK-MVE-NEXT:    vabs.f16 s0, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vabs.f16 s8, s8
-; CHECK-MVE-NEXT:    vabs.f16 s1, s5
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vabs.f16 s8, s8
-; CHECK-MVE-NEXT:    vabs.f16 s2, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vabs.f16 s8, s8
-; CHECK-MVE-NEXT:    vabs.f16 s3, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
+; CHECK-MVE-NEXT:    vabs.f16 s0, s0
+; CHECK-MVE-NEXT:    vabs.f16 s4, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vabs.f16 s4, s4
+; CHECK-MVE-NEXT:    vabs.f16 s1, s1
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vabs.f16 s4, s4
+; CHECK-MVE-NEXT:    vabs.f16 s2, s2
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vabs.f16 s4, s4
+; CHECK-MVE-NEXT:    vabs.f16 s3, s3
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fabs_float16_t:
@@ -108,11 +105,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @fabs_float32_t(<4 x float> %src) {
 ; CHECK-MVE-LABEL: fabs_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vabs.f32 s7, s3
-; CHECK-MVE-NEXT:    vabs.f32 s6, s2
-; CHECK-MVE-NEXT:    vabs.f32 s5, s1
-; CHECK-MVE-NEXT:    vabs.f32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vabs.f32 s3, s3
+; CHECK-MVE-NEXT:    vabs.f32 s2, s2
+; CHECK-MVE-NEXT:    vabs.f32 s1, s1
+; CHECK-MVE-NEXT:    vabs.f32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fabs_float32_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-frint.ll b/llvm/test/CodeGen/Thumb2/mve-frint.ll
index 8cfc1418d4d9b..1d7dcc8bf8440 100644
--- a/llvm/test/CodeGen/Thumb2/mve-frint.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-frint.ll
@@ -5,11 +5,10 @@
 define arm_aapcs_vfpcc <4 x float> @fceil_float32_t(<4 x float> %src) {
 ; CHECK-MVE-LABEL: fceil_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vrintp.f32 s7, s3
-; CHECK-MVE-NEXT:    vrintp.f32 s6, s2
-; CHECK-MVE-NEXT:    vrintp.f32 s5, s1
-; CHECK-MVE-NEXT:    vrintp.f32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintp.f32 s3, s3
+; CHECK-MVE-NEXT:    vrintp.f32 s2, s2
+; CHECK-MVE-NEXT:    vrintp.f32 s1, s1
+; CHECK-MVE-NEXT:    vrintp.f32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fceil_float32_t:
@@ -24,23 +23,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fceil_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fceil_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
-; CHECK-MVE-NEXT:    vrintp.f16 s8, s0
-; CHECK-MVE-NEXT:    vrintp.f16 s0, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vrintp.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintp.f16 s1, s5
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vrintp.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintp.f16 s2, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vrintp.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintp.f16 s3, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
+; CHECK-MVE-NEXT:    vrintp.f16 s0, s0
+; CHECK-MVE-NEXT:    vrintp.f16 s4, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vrintp.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintp.f16 s1, s1
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vrintp.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintp.f16 s2, s2
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vrintp.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintp.f16 s3, s3
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fceil_float16_t:
@@ -79,11 +77,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @ftrunc_float32_t(<4 x float> %src) {
 ; CHECK-MVE-LABEL: ftrunc_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vrintz.f32 s7, s3
-; CHECK-MVE-NEXT:    vrintz.f32 s6, s2
-; CHECK-MVE-NEXT:    vrintz.f32 s5, s1
-; CHECK-MVE-NEXT:    vrintz.f32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintz.f32 s3, s3
+; CHECK-MVE-NEXT:    vrintz.f32 s2, s2
+; CHECK-MVE-NEXT:    vrintz.f32 s1, s1
+; CHECK-MVE-NEXT:    vrintz.f32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: ftrunc_float32_t:
@@ -98,23 +95,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @ftrunc_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: ftrunc_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
-; CHECK-MVE-NEXT:    vrintz.f16 s8, s0
-; CHECK-MVE-NEXT:    vrintz.f16 s0, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vrintz.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintz.f16 s1, s5
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vrintz.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintz.f16 s2, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vrintz.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintz.f16 s3, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
+; CHECK-MVE-NEXT:    vrintz.f16 s0, s0
+; CHECK-MVE-NEXT:    vrintz.f16 s4, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vrintz.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintz.f16 s1, s1
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vrintz.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintz.f16 s2, s2
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vrintz.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintz.f16 s3, s3
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: ftrunc_float16_t:
@@ -153,11 +149,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @frint_float32_t(<4 x float> %src) {
 ; CHECK-MVE-LABEL: frint_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vrintx.f32 s7, s3
-; CHECK-MVE-NEXT:    vrintx.f32 s6, s2
-; CHECK-MVE-NEXT:    vrintx.f32 s5, s1
-; CHECK-MVE-NEXT:    vrintx.f32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintx.f32 s3, s3
+; CHECK-MVE-NEXT:    vrintx.f32 s2, s2
+; CHECK-MVE-NEXT:    vrintx.f32 s1, s1
+; CHECK-MVE-NEXT:    vrintx.f32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: frint_float32_t:
@@ -172,23 +167,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @frint_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: frint_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
-; CHECK-MVE-NEXT:    vrintx.f16 s8, s0
-; CHECK-MVE-NEXT:    vrintx.f16 s0, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vrintx.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintx.f16 s1, s5
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vrintx.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintx.f16 s2, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vrintx.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintx.f16 s3, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
+; CHECK-MVE-NEXT:    vrintx.f16 s0, s0
+; CHECK-MVE-NEXT:    vrintx.f16 s4, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vrintx.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintx.f16 s1, s1
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vrintx.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintx.f16 s2, s2
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vrintx.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintx.f16 s3, s3
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: frint_float16_t:
@@ -227,11 +221,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @fnearbyint_float32_t(<4 x float> %src) {
 ; CHECK-LABEL: fnearbyint_float32_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vrintr.f32 s7, s3
-; CHECK-NEXT:    vrintr.f32 s6, s2
-; CHECK-NEXT:    vrintr.f32 s5, s1
-; CHECK-NEXT:    vrintr.f32 s4, s0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vrintr.f32 s3, s3
+; CHECK-NEXT:    vrintr.f32 s2, s2
+; CHECK-NEXT:    vrintr.f32 s1, s1
+; CHECK-NEXT:    vrintr.f32 s0, s0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %src)
@@ -241,23 +234,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fnearbyint_float16_t(<8 x half> %src) {
 ; CHECK-LABEL: fnearbyint_float16_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vrintr.f16 s8, s0
-; CHECK-NEXT:    vrintr.f16 s0, s4
-; CHECK-NEXT:    vins.f16 s0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vrintr.f16 s8, s8
-; CHECK-NEXT:    vrintr.f16 s1, s5
-; CHECK-NEXT:    vins.f16 s1, s8
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vrintr.f16 s8, s8
-; CHECK-NEXT:    vrintr.f16 s2, s6
-; CHECK-NEXT:    vins.f16 s2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s7
-; CHECK-NEXT:    vrintr.f16 s8, s8
-; CHECK-NEXT:    vrintr.f16 s3, s7
-; CHECK-NEXT:    vins.f16 s3, s8
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vrintr.f16 s0, s0
+; CHECK-NEXT:    vrintr.f16 s4, s4
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vrintr.f16 s4, s4
+; CHECK-NEXT:    vrintr.f16 s1, s1
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vrintr.f16 s4, s4
+; CHECK-NEXT:    vrintr.f16 s2, s2
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vrintr.f16 s4, s4
+; CHECK-NEXT:    vrintr.f16 s3, s3
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = call fast <8 x half> @llvm.nearbyint.v8f16(<8 x half> %src)
@@ -291,11 +283,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @ffloor_float32_t(<4 x float> %src) {
 ; CHECK-MVE-LABEL: ffloor_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vrintm.f32 s7, s3
-; CHECK-MVE-NEXT:    vrintm.f32 s6, s2
-; CHECK-MVE-NEXT:    vrintm.f32 s5, s1
-; CHECK-MVE-NEXT:    vrintm.f32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintm.f32 s3, s3
+; CHECK-MVE-NEXT:    vrintm.f32 s2, s2
+; CHECK-MVE-NEXT:    vrintm.f32 s1, s1
+; CHECK-MVE-NEXT:    vrintm.f32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: ffloor_float32_t:
@@ -310,23 +301,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @ffloor_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: ffloor_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
-; CHECK-MVE-NEXT:    vrintm.f16 s8, s0
-; CHECK-MVE-NEXT:    vrintm.f16 s0, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vrintm.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintm.f16 s1, s5
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vrintm.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintm.f16 s2, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vrintm.f16 s8, s8
-; CHECK-MVE-NEXT:    vrintm.f16 s3, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
+; CHECK-MVE-NEXT:    vrintm.f16 s0, s0
+; CHECK-MVE-NEXT:    vrintm.f16 s4, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vrintm.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintm.f16 s1, s1
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vrintm.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintm.f16 s2, s2
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vrintm.f16 s4, s4
+; CHECK-MVE-NEXT:    vrintm.f16 s3, s3
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: ffloor_float16_t:
@@ -365,11 +355,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @fround_float32_t(<4 x float> %src) {
 ; CHECK-MVE-LABEL: fround_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vrinta.f32 s7, s3
-; CHECK-MVE-NEXT:    vrinta.f32 s6, s2
-; CHECK-MVE-NEXT:    vrinta.f32 s5, s1
-; CHECK-MVE-NEXT:    vrinta.f32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrinta.f32 s3, s3
+; CHECK-MVE-NEXT:    vrinta.f32 s2, s2
+; CHECK-MVE-NEXT:    vrinta.f32 s1, s1
+; CHECK-MVE-NEXT:    vrinta.f32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fround_float32_t:
@@ -384,23 +373,22 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fround_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fround_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
-; CHECK-MVE-NEXT:    vrinta.f16 s8, s0
-; CHECK-MVE-NEXT:    vrinta.f16 s0, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vrinta.f16 s8, s8
-; CHECK-MVE-NEXT:    vrinta.f16 s1, s5
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vrinta.f16 s8, s8
-; CHECK-MVE-NEXT:    vrinta.f16 s2, s6
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
-; CHECK-MVE-NEXT:    vrinta.f16 s8, s8
-; CHECK-MVE-NEXT:    vrinta.f16 s3, s7
-; CHECK-MVE-NEXT:    vins.f16 s3, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
+; CHECK-MVE-NEXT:    vrinta.f16 s0, s0
+; CHECK-MVE-NEXT:    vrinta.f16 s4, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vrinta.f16 s4, s4
+; CHECK-MVE-NEXT:    vrinta.f16 s1, s1
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vrinta.f16 s4, s4
+; CHECK-MVE-NEXT:    vrinta.f16 s2, s2
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vrinta.f16 s4, s4
+; CHECK-MVE-NEXT:    vrinta.f16 s3, s3
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fround_float16_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
index 37e4122ac012c..ac1c0d03c85b5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@@ -89,23 +89,23 @@ define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %off
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r1]
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q1, q0, r0
-; CHECK-NEXT:    vmov r2, r3, d2
-; CHECK-NEXT:    vldr.16 s8, [r3]
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, r3, d3
-; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    vldr.16 s4, [r3]
-; CHECK-NEXT:    vldr.16 s1, [r2]
-; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vldr.16 s0, [r2]
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vins.f16 s0, s4
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vldr.16 s2, [r3]
+; CHECK-NEXT:    vldr.16 s1, [r2]
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vins.f16 s1, s2
 ; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vldr.16 s8, [r1]
+; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vldr.16 s2, [r0]
 ; CHECK-NEXT:    vmov r0, r1, d3
-; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vins.f16 s2, s4
 ; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vldr.16 s3, [r0]
 ; CHECK-NEXT:    vins.f16 s3, s4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 816969209ff8d..654e7eea28a1c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -172,10 +172,10 @@ define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(<8 x float*>* %offptr) {
 ; CHECK-NEXT:    vldr s3, [r2]
 ; CHECK-NEXT:    vldr s2, [r12]
 ; CHECK-NEXT:    vldr s1, [r1]
+; CHECK-NEXT:    vldr s0, [lr]
 ; CHECK-NEXT:    vldr s7, [r3]
 ; CHECK-NEXT:    vldr s6, [r0]
 ; CHECK-NEXT:    vldr s5, [r5]
-; CHECK-NEXT:    vldr s0, [lr]
 ; CHECK-NEXT:    vldr s4, [r4]
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -413,23 +413,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) {
 ; CHECK-LABEL: ptr_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r1, r2, d2
-; CHECK-NEXT:    vldr.16 s8, [r2]
-; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, r2, d3
-; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r1, r2, d0
 ; CHECK-NEXT:    vldr.16 s4, [r2]
-; CHECK-NEXT:    vldr.16 s1, [r1]
-; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vldr.16 s0, [r1]
+; CHECK-NEXT:    vmov r1, r2, d1
+; CHECK-NEXT:    vins.f16 s0, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldr.16 s1, [r1]
+; CHECK-NEXT:    vldr.16 s2, [r2]
 ; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vldr.16 s8, [r1]
+; CHECK-NEXT:    vins.f16 s1, s2
+; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vldr.16 s2, [r0]
 ; CHECK-NEXT:    vmov r0, r1, d3
-; CHECK-NEXT:    vins.f16 s2, s8
-; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vldr.16 s3, [r0]
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
@@ -441,15 +441,15 @@ entry:
 define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) {
 ; CHECK-LABEL: ptr_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vldr.16 s8, [r1]
-; CHECK-NEXT:    vldr.16 s0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d3
-; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d0
 ; CHECK-NEXT:    vldr.16 s4, [r1]
+; CHECK-NEXT:    vldr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vldr.16 s2, [r1]
 ; CHECK-NEXT:    vldr.16 s1, [r0]
-; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vins.f16 s1, s2
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x half*>, <4 x half*>* %offptr, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
index 9d4d261a82709..917cec927a993 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
@@ -81,8 +81,6 @@ declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8
 define arm_aapcs_vfpcc void @test_vst2q_u32(i32* %addr, %struct.uint32x4x2_t %value.coerce) {
 ; CHECK-LABEL: test_vst2q_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
 ; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
 ; CHECK-NEXT:    bx lr
@@ -97,8 +95,6 @@ entry:
 define arm_aapcs_vfpcc i32* @test_vst2q_u32_post(i32* %addr, %struct.uint32x4x2_t %value.coerce) {
 ; CHECK-LABEL: test_vst2q_u32_post:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
 ; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]!
 ; CHECK-NEXT:    bx lr
@@ -116,8 +112,6 @@ declare void @llvm.arm.mve.vst2q.p0i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32)
 define arm_aapcs_vfpcc void @test_vst2q_f16(half* %addr, %struct.float16x8x2_t %value.coerce) {
 ; CHECK-LABEL: test_vst2q_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    vst20.16 {q0, q1}, [r0]
 ; CHECK-NEXT:    vst21.16 {q0, q1}, [r0]
 ; CHECK-NEXT:    bx lr
@@ -132,8 +126,6 @@ entry:
 define arm_aapcs_vfpcc half* @test_vst2q_f16_post(half* %addr, %struct.float16x8x2_t %value.coerce) {
 ; CHECK-LABEL: test_vst2q_f16_post:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    vst20.16 {q0, q1}, [r0]
 ; CHECK-NEXT:    vst21.16 {q0, q1}, [r0]!
 ; CHECK-NEXT:    bx lr
@@ -151,10 +143,6 @@ declare void @llvm.arm.mve.vst2q.p0f16.v8f16(half*, <8 x half>, <8 x half>, i32)
 define arm_aapcs_vfpcc void @test_vst4q_s8(i8* %addr, %struct.int8x16x4_t %value.coerce) {
 ; CHECK-LABEL: test_vst4q_s8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r0]
@@ -175,10 +163,6 @@ entry:
 define arm_aapcs_vfpcc i8* @test_vst4q_s8_post(i8* %addr, %struct.int8x16x4_t %value.coerce) {
 ; CHECK-LABEL: test_vst4q_s8_post:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index bf601d71761cc..120105cfd14c7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -6,55 +6,55 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d9}
+; CHECK-NEXT:    vpush {d9}
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    vldrw.u32 q5, [r2]
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.f32 s10, s7
 ; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vand q2, q2, q0
+; CHECK-NEXT:    vand q3, q2, q0
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r4, r1, d4
-; CHECK-NEXT:    vmov.f32 s12, s6
-; CHECK-NEXT:    vmov.f32 s14, s7
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov.f32 s16, s22
-; CHECK-NEXT:    vmov.f32 s18, s23
 ; CHECK-NEXT:    vmov r3, lr, d0
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vmov r0, r12, d5
-; CHECK-NEXT:    vmov.f32 s8, s20
-; CHECK-NEXT:    vmov.f32 s10, s21
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov r4, r1, d6
+; CHECK-NEXT:    vmov r0, r12, d7
+; CHECK-NEXT:    vldrw.u32 q3, [r2]
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov.f32 s0, s12
+; CHECK-NEXT:    vmov.f32 s6, s13
 ; CHECK-NEXT:    adds r2, r5, r4
-; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    vmov r4, s8
 ; CHECK-NEXT:    asr.w r6, r5, #31
 ; CHECK-NEXT:    adcs r1, r6
 ; CHECK-NEXT:    asrl r2, r1, r4
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    adds r6, r1, r3
-; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    asr.w r4, r1, #31
 ; CHECK-NEXT:    adc.w r1, r4, lr
 ; CHECK-NEXT:    asrl r6, r1, r3
 ; CHECK-NEXT:    vmov r5, r4, d1
-; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r1, s10
 ; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
 ; CHECK-NEXT:    adds r0, r0, r1
 ; CHECK-NEXT:    asr.w r3, r1, #31
 ; CHECK-NEXT:    adc.w r1, r3, r12
 ; CHECK-NEXT:    vmov r3, s18
 ; CHECK-NEXT:    asrl r0, r1, r3
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    adds r6, r1, r5
 ; CHECK-NEXT:    asr.w r2, r1, #31
 ; CHECK-NEXT:    adc.w r1, r2, r4
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    asrl r6, r1, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r6, r0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vpop {d9}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %a = load <4 x i32>, <4 x i32> *%A, align 4
@@ -142,30 +142,30 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    .vsave {d9}
+; CHECK-NEXT:    vpush {d9}
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
-; CHECK-NEXT:    vldrw.u32 q5, [r2]
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vmov.f32 s6, s11
-; CHECK-NEXT:    vmov.f32 s10, s9
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vand q3, q2, q0
 ; CHECK-NEXT:    vand q1, q1, q0
-; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r5, r1, d2
-; CHECK-NEXT:    vmov.f32 s12, s2
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov r6, s12
-; CHECK-NEXT:    vmov.f32 s16, s22
-; CHECK-NEXT:    vmov.f32 s18, s23
-; CHECK-NEXT:    vmov r4, lr, d4
-; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov r0, r12, d3
-; CHECK-NEXT:    vmov.f32 s4, s20
-; CHECK-NEXT:    vmov.f32 s6, s21
+; CHECK-NEXT:    vmov r4, lr, d2
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov r5, r1, d6
+; CHECK-NEXT:    vmov r0, r12, d7
+; CHECK-NEXT:    vldrw.u32 q3, [r2]
+; CHECK-NEXT:    vmov.f32 s10, s3
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f32 s14, s1
+; CHECK-NEXT:    vmov r6, s4
+; CHECK-NEXT:    vmov.f32 s4, s12
+; CHECK-NEXT:    vmov.f32 s2, s13
 ; CHECK-NEXT:    adds r2, r6, r5
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov r5, s8
 ; CHECK-NEXT:    asr.w r7, r6, #31
 ; CHECK-NEXT:    adcs r1, r7
 ; CHECK-NEXT:    asrl r2, r1, r5
@@ -175,23 +175,23 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
 ; CHECK-NEXT:    asr.w r5, r1, #31
 ; CHECK-NEXT:    adc.w r1, r5, lr
 ; CHECK-NEXT:    asrl r4, r1, r7
-; CHECK-NEXT:    vmov r6, r5, d5
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
+; CHECK-NEXT:    vmov r6, r5, d3
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r2
 ; CHECK-NEXT:    adds r0, r0, r1
 ; CHECK-NEXT:    asr.w r7, r1, #31
 ; CHECK-NEXT:    adc.w r1, r7, r12
 ; CHECK-NEXT:    vmov r7, s18
 ; CHECK-NEXT:    asrl r0, r1, r7
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    adds r6, r6, r1
 ; CHECK-NEXT:    asr.w r2, r1, #31
 ; CHECK-NEXT:    adc.w r1, r2, r5
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    asrl r6, r1, r2
-; CHECK-NEXT:    vmov q2[3], q2[1], r6, r0
-; CHECK-NEXT:    vstrw.32 q2, [r3]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov q1[3], q1[1], r6, r0
+; CHECK-NEXT:    vstrw.32 q1, [r3]
+; CHECK-NEXT:    vpop {d9}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
@@ -280,9 +280,9 @@ define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    adds.w r12, r2, r2
 ; CHECK-NEXT:    asr.w r3, r2, #31
 ; CHECK-NEXT:    adc.w r7, r3, r2, asr #31
@@ -370,26 +370,24 @@ define arm_aapcs_vfpcc void @mul_i32(<4 x i32> *%A, <4 x i32> *%B, i64 %C, <4 x
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    ldr.w lr, [sp, #20]
-; CHECK-NEXT:    vmov.f32 s8, s0
-; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vmov.f32 s14, s5
 ; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov r5, s4
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vmov.f32 s6, s7
 ; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    smull r12, r3, r1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmullb.s32 q2, q1, q0
 ; CHECK-NEXT:    asrl r12, r3, r2
-; CHECK-NEXT:    vmov.f32 s2, s7
-; CHECK-NEXT:    vmullb.s32 q1, q0, q2
-; CHECK-NEXT:    vmov r6, r1, d2
-; CHECK-NEXT:    vmov r4, r7, d3
+; CHECK-NEXT:    vmov r6, r1, d4
+; CHECK-NEXT:    vmov r4, r7, d5
 ; CHECK-NEXT:    asrl r6, r1, r2
 ; CHECK-NEXT:    asrl r4, r7, r2
 ; CHECK-NEXT:    smull r0, r5, r5, r0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index 44fd3e621969c..655a67bad734e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -65,20 +65,20 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-LABEL: ext_add_trunc_i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.f32 s12, s2
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s6, s7
 ; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov.f32 s8, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.f32 s2, s5
 ; CHECK-NEXT:    add.w r12, r1, r0
-; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    add r1, r2
-; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    add r0, r3
@@ -184,17 +184,17 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32>
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vmov.f32 s12, s6
 ; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s14, s7
 ; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov.f32 s14, s7
 ; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov r0, r1, d6
 ; CHECK-NEXT:    vmov r2, r3, d2
-; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov r0, r1, d6
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov.f32 s10, s1
 ; CHECK-NEXT:    vmov r12, lr, d7
+; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    adds r0, r0, r4
 ; CHECK-NEXT:    asr.w r5, r4, #31
 ; CHECK-NEXT:    adcs r1, r5
@@ -205,9 +205,9 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32>
 ; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    lsrl r2, r3, #1
 ; CHECK-NEXT:    vmov r1, r5, d3
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
-; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    adds.w r4, r3, r12
 ; CHECK-NEXT:    asr.w r6, r3, #31
 ; CHECK-NEXT:    adc.w r3, r6, lr
@@ -216,8 +216,7 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32>
 ; CHECK-NEXT:    adc.w r1, r2, r5
 ; CHECK-NEXT:    lsrl r4, r3, #1
 ; CHECK-NEXT:    lsrl r0, r1, #1
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r4
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r4
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %sa = sext <4 x i32> %a to <4 x i64>
@@ -346,11 +345,11 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-NEXT:    vand q2, q2, q3
 ; CHECK-NEXT:    vand q1, q1, q3
 ; CHECK-NEXT:    vmov.f32 s12, s2
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
 ; CHECK-NEXT:    vmov r12, r2, d5
 ; CHECK-NEXT:    vmov r8, r9, d3
-; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.f32 s2, s1
 ; CHECK-NEXT:    vmov lr, s2
 ; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    adds.w r4, r1, r12
@@ -359,21 +358,21 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-NEXT:    asrl r4, r5, r12
 ; CHECK-NEXT:    subs.w r0, r4, r12
 ; CHECK-NEXT:    sbc.w r2, r5, r2
-; CHECK-NEXT:    asr.w r5, lr, #31
 ; CHECK-NEXT:    umull r0, r4, r0, r12
 ; CHECK-NEXT:    adds.w r6, lr, r8
+; CHECK-NEXT:    mla r3, r2, r12, r4
+; CHECK-NEXT:    asr.w r5, lr, #31
 ; CHECK-NEXT:    adc.w r5, r5, r9
+; CHECK-NEXT:    rsbs r2, r1, #0
 ; CHECK-NEXT:    asrl r6, r5, r8
-; CHECK-NEXT:    mla r3, r2, r12, r4
+; CHECK-NEXT:    lsll r0, r3, r2
 ; CHECK-NEXT:    subs.w r7, r6, r8
+; CHECK-NEXT:    vmov r6, r2, d4
 ; CHECK-NEXT:    sbc.w r10, r5, r9
-; CHECK-NEXT:    rsbs r2, r1, #0
 ; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    lsll r0, r3, r2
-; CHECK-NEXT:    vmov r6, r2, d4
 ; CHECK-NEXT:    lsll r0, r3, r12
-; CHECK-NEXT:    asrs r3, r5, #31
 ; CHECK-NEXT:    adds r4, r5, r6
+; CHECK-NEXT:    asr.w r3, r5, #31
 ; CHECK-NEXT:    adcs r3, r2
 ; CHECK-NEXT:    asrl r4, r3, r6
 ; CHECK-NEXT:    subs r4, r4, r6

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index 2abf5ef19addd..f65ad3e7de22b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -851,15 +851,15 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    bpl .LBB18_5
 ; CHECK-LE-NEXT:  .LBB18_4: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    vldr.16 s2, [r2, #6]
+; CHECK-LE-NEXT:    vins.f16 s1, s2
 ; CHECK-LE-NEXT:  .LBB18_5: @ %else8
 ; CHECK-LE-NEXT:    vmrs r2, p0
 ; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-LE-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-LE-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-LE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s0, s0
 ; CHECK-LE-NEXT:    and r3, r2, #1
 ; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
@@ -874,19 +874,19 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, s4
+; CHECK-LE-NEXT:    vmovne r2, s0
 ; CHECK-LE-NEXT:    strne r2, [r0]
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s5
+; CHECK-LE-NEXT:    vmovmi r2, s1
 ; CHECK-LE-NEXT:    strmi r2, [r0, #4]
 ; CHECK-LE-NEXT:    lsls r2, r1, #29
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s6
+; CHECK-LE-NEXT:    vmovmi r2, s2
 ; CHECK-LE-NEXT:    strmi r2, [r0, #8]
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s7
+; CHECK-LE-NEXT:    vmovmi r1, s3
 ; CHECK-LE-NEXT:    strmi r1, [r0, #12]
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    pop {r7, pc}
@@ -895,14 +895,14 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-LE-NEXT:    lsls r3, r1, #30
 ; CHECK-LE-NEXT:    bpl .LBB18_2
 ; CHECK-LE-NEXT:  .LBB18_7: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vldr.16 s2, [r2, #2]
+; CHECK-LE-NEXT:    vins.f16 s0, s2
 ; CHECK-LE-NEXT:    lsls r3, r1, #29
 ; CHECK-LE-NEXT:    bpl .LBB18_3
 ; CHECK-LE-NEXT:  .LBB18_8: @ %cond.load4
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-LE-NEXT:    vldr.16 s1, [r2, #4]
-; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    vmovx.f16 s2, s0
+; CHECK-LE-NEXT:    vins.f16 s1, s2
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    bmi .LBB18_4
 ; CHECK-LE-NEXT:    b .LBB18_5
@@ -942,15 +942,15 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    beq .LBB18_5
 ; CHECK-BE-NEXT:  .LBB18_4: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    vldr.16 s2, [r2, #6]
+; CHECK-BE-NEXT:    vins.f16 s1, s2
 ; CHECK-BE-NEXT:  .LBB18_5: @ %else8
 ; CHECK-BE-NEXT:    vmrs r2, p0
 ; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-BE-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-BE-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-BE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s0, s0
 ; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
@@ -965,19 +965,19 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-BE-NEXT:    lsls r2, r1, #28
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s4
+; CHECK-BE-NEXT:    vmovmi r2, s0
 ; CHECK-BE-NEXT:    strmi r2, [r0]
 ; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s5
+; CHECK-BE-NEXT:    vmovmi r2, s1
 ; CHECK-BE-NEXT:    strmi r2, [r0, #4]
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s6
+; CHECK-BE-NEXT:    vmovmi r2, s2
 ; CHECK-BE-NEXT:    strmi r2, [r0, #8]
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r1, s7
+; CHECK-BE-NEXT:    vmovne r1, s3
 ; CHECK-BE-NEXT:    strne r1, [r0, #12]
 ; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    pop {r7, pc}
@@ -986,14 +986,14 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-BE-NEXT:    lsls r3, r1, #29
 ; CHECK-BE-NEXT:    bpl .LBB18_2
 ; CHECK-BE-NEXT:  .LBB18_7: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vldr.16 s2, [r2, #2]
+; CHECK-BE-NEXT:    vins.f16 s0, s2
 ; CHECK-BE-NEXT:    lsls r3, r1, #30
 ; CHECK-BE-NEXT:    bpl .LBB18_3
 ; CHECK-BE-NEXT:  .LBB18_8: @ %cond.load4
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-BE-NEXT:    vldr.16 s1, [r2, #4]
-; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    vmovx.f16 s2, s0
+; CHECK-BE-NEXT:    vins.f16 s1, s2
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    bne .LBB18_4
 ; CHECK-BE-NEXT:    b .LBB18_5
@@ -1042,15 +1042,15 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    bpl .LBB19_5
 ; CHECK-LE-NEXT:  .LBB19_4: @ %cond.load7
-; CHECK-LE-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    vldr.16 s2, [r2, #6]
+; CHECK-LE-NEXT:    vins.f16 s1, s2
 ; CHECK-LE-NEXT:  .LBB19_5: @ %else8
 ; CHECK-LE-NEXT:    vmrs r2, p0
 ; CHECK-LE-NEXT:    movs r1, #0
-; CHECK-LE-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-LE-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-LE-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-LE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s0, s0
 ; CHECK-LE-NEXT:    and r3, r2, #1
 ; CHECK-LE-NEXT:    rsbs r3, r3, #0
 ; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
@@ -1065,19 +1065,19 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    itt ne
-; CHECK-LE-NEXT:    vmovne r2, s4
+; CHECK-LE-NEXT:    vmovne r2, s0
 ; CHECK-LE-NEXT:    strne r2, [r0]
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s5
+; CHECK-LE-NEXT:    vmovmi r2, s1
 ; CHECK-LE-NEXT:    strmi r2, [r0, #4]
 ; CHECK-LE-NEXT:    lsls r2, r1, #29
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r2, s6
+; CHECK-LE-NEXT:    vmovmi r2, s2
 ; CHECK-LE-NEXT:    strmi r2, [r0, #8]
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    itt mi
-; CHECK-LE-NEXT:    vmovmi r1, s7
+; CHECK-LE-NEXT:    vmovmi r1, s3
 ; CHECK-LE-NEXT:    strmi r1, [r0, #12]
 ; CHECK-LE-NEXT:    add sp, #4
 ; CHECK-LE-NEXT:    pop {r7, pc}
@@ -1086,14 +1086,14 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-LE-NEXT:    lsls r3, r1, #30
 ; CHECK-LE-NEXT:    bpl .LBB19_2
 ; CHECK-LE-NEXT:  .LBB19_7: @ %cond.load1
-; CHECK-LE-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vldr.16 s2, [r2, #2]
+; CHECK-LE-NEXT:    vins.f16 s0, s2
 ; CHECK-LE-NEXT:    lsls r3, r1, #29
 ; CHECK-LE-NEXT:    bpl .LBB19_3
 ; CHECK-LE-NEXT:  .LBB19_8: @ %cond.load4
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-LE-NEXT:    vldr.16 s1, [r2, #4]
-; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    vmovx.f16 s2, s0
+; CHECK-LE-NEXT:    vins.f16 s1, s2
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    bmi .LBB19_4
 ; CHECK-LE-NEXT:    b .LBB19_5
@@ -1133,15 +1133,15 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    beq .LBB19_5
 ; CHECK-BE-NEXT:  .LBB19_4: @ %cond.load7
-; CHECK-BE-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    vldr.16 s2, [r2, #6]
+; CHECK-BE-NEXT:    vins.f16 s1, s2
 ; CHECK-BE-NEXT:  .LBB19_5: @ %else8
 ; CHECK-BE-NEXT:    vmrs r2, p0
 ; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-BE-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-BE-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-BE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s0, s0
 ; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
@@ -1156,19 +1156,19 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-BE-NEXT:    lsls r2, r1, #28
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s4
+; CHECK-BE-NEXT:    vmovmi r2, s0
 ; CHECK-BE-NEXT:    strmi r2, [r0]
 ; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s5
+; CHECK-BE-NEXT:    vmovmi r2, s1
 ; CHECK-BE-NEXT:    strmi r2, [r0, #4]
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    itt mi
-; CHECK-BE-NEXT:    vmovmi r2, s6
+; CHECK-BE-NEXT:    vmovmi r2, s2
 ; CHECK-BE-NEXT:    strmi r2, [r0, #8]
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    itt ne
-; CHECK-BE-NEXT:    vmovne r1, s7
+; CHECK-BE-NEXT:    vmovne r1, s3
 ; CHECK-BE-NEXT:    strne r1, [r0, #12]
 ; CHECK-BE-NEXT:    add sp, #4
 ; CHECK-BE-NEXT:    pop {r7, pc}
@@ -1177,14 +1177,14 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-BE-NEXT:    lsls r3, r1, #29
 ; CHECK-BE-NEXT:    bpl .LBB19_2
 ; CHECK-BE-NEXT:  .LBB19_7: @ %cond.load1
-; CHECK-BE-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vldr.16 s2, [r2, #2]
+; CHECK-BE-NEXT:    vins.f16 s0, s2
 ; CHECK-BE-NEXT:    lsls r3, r1, #30
 ; CHECK-BE-NEXT:    bpl .LBB19_3
 ; CHECK-BE-NEXT:  .LBB19_8: @ %cond.load4
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-BE-NEXT:    vldr.16 s1, [r2, #4]
-; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    vmovx.f16 s2, s0
+; CHECK-BE-NEXT:    vins.f16 s1, s2
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    bne .LBB19_4
 ; CHECK-BE-NEXT:    b .LBB19_5

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
index c6c989b2ff85e..8c30520d02cd4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -108,8 +108,8 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]!
 ; CHECK-LE-NEXT:    bx lr
@@ -117,8 +117,8 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-BE-LABEL: masked_v4i32_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.32 q2, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]!
@@ -137,8 +137,8 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrwt.32 q1, [r0], #4
 ; CHECK-LE-NEXT:    bx lr
@@ -146,8 +146,8 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-BE-LABEL: masked_v4i32_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.32 q2, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrwt.32 q1, [r0], #4
@@ -327,8 +327,8 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]!
 ; CHECK-LE-NEXT:    bx lr
@@ -336,8 +336,8 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-BE-LABEL: masked_v8i16_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.16 q2, q0
 ; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]!
@@ -356,8 +356,8 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8i16_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrht.16 q1, [r0], #4
 ; CHECK-LE-NEXT:    bx lr
@@ -365,8 +365,8 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-BE-LABEL: masked_v8i16_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.16 q2, q0
 ; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrht.16 q1, [r0], #4
@@ -405,8 +405,8 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s8 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrbt.8 q1, [r0, #4]!
 ; CHECK-LE-NEXT:    bx lr
@@ -414,8 +414,8 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-BE-LABEL: masked_v16i8_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.8 q2, q0
 ; CHECK-BE-NEXT:    vpt.s8 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrbt.8 q1, [r0, #4]!
@@ -434,8 +434,8 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-LE-LABEL: masked_v16i8_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s8 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrbt.8 q1, [r0], #4
 ; CHECK-LE-NEXT:    bx lr
@@ -443,8 +443,8 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) {
 ; CHECK-BE-LABEL: masked_v16i8_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.8 q2, q0
 ; CHECK-BE-NEXT:    vpt.s8 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrbt.8 q1, [r0], #4
@@ -568,8 +568,8 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]!
 ; CHECK-LE-NEXT:    bx lr
@@ -577,8 +577,8 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-BE-LABEL: masked_v4f32_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.32 q2, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]!
@@ -597,8 +597,8 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4f32_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrwt.32 q1, [r0], #4
 ; CHECK-LE-NEXT:    bx lr
@@ -606,8 +606,8 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-BE-LABEL: masked_v4f32_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.32 q2, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrwt.32 q1, [r0], #4
@@ -709,8 +709,8 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %
 ; CHECK-LE-NEXT:    lsls r2, r1, #30
 ; CHECK-LE-NEXT:    bpl .LBB16_2
 ; CHECK-LE-NEXT:  .LBB16_10: @ %cond.store1
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #24]
+; CHECK-LE-NEXT:    vmovx.f16 s0, s0
+; CHECK-LE-NEXT:    vstr.16 s0, [sp, #24]
 ; CHECK-LE-NEXT:    ldrh.w r2, [sp, #24]
 ; CHECK-LE-NEXT:    strh r2, [r0, #2]
 ; CHECK-LE-NEXT:    lsls r2, r1, #29
@@ -722,8 +722,8 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %
 ; CHECK-LE-NEXT:    lsls r2, r1, #28
 ; CHECK-LE-NEXT:    bpl .LBB16_4
 ; CHECK-LE-NEXT:  .LBB16_12: @ %cond.store5
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #16]
+; CHECK-LE-NEXT:    vmovx.f16 s0, s1
+; CHECK-LE-NEXT:    vstr.16 s0, [sp, #16]
 ; CHECK-LE-NEXT:    ldrh.w r2, [sp, #16]
 ; CHECK-LE-NEXT:    strh r2, [r0, #6]
 ; CHECK-LE-NEXT:    lsls r2, r1, #27
@@ -735,8 +735,8 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %
 ; CHECK-LE-NEXT:    lsls r2, r1, #26
 ; CHECK-LE-NEXT:    bpl .LBB16_6
 ; CHECK-LE-NEXT:  .LBB16_14: @ %cond.store9
-; CHECK-LE-NEXT:    vmovx.f16 s4, s2
-; CHECK-LE-NEXT:    vstr.16 s4, [sp, #8]
+; CHECK-LE-NEXT:    vmovx.f16 s0, s2
+; CHECK-LE-NEXT:    vstr.16 s0, [sp, #8]
 ; CHECK-LE-NEXT:    ldrh.w r2, [sp, #8]
 ; CHECK-LE-NEXT:    strh r2, [r0, #10]
 ; CHECK-LE-NEXT:    lsls r2, r1, #25
@@ -877,8 +877,8 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_pre:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]!
 ; CHECK-LE-NEXT:    bx lr
@@ -886,8 +886,8 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-BE-LABEL: masked_v8f16_pre:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.16 q2, q0
 ; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]!
@@ -906,8 +906,8 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-LABEL: masked_v8f16_post:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vldr d1, [sp]
-; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
+; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
 ; CHECK-LE-NEXT:    vstrht.16 q1, [r0], #4
 ; CHECK-LE-NEXT:    bx lr
@@ -915,8 +915,8 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-BE-LABEL: masked_v8f16_post:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vldr d1, [sp]
-; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
+; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-BE-NEXT:    vrev64.16 q2, q0
 ; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
 ; CHECK-BE-NEXT:    vstrht.16 q1, [r0], #4
@@ -1253,12 +1253,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float>
 ; CHECK-LE-NEXT:    it gt
 ; CHECK-LE-NEXT:    movgt r2, #1
 ; CHECK-LE-NEXT:    cmp r2, #0
-; CHECK-LE-NEXT:    vcvtb.f16.f32 s4, s0
 ; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-LE-NEXT:    csetm r2, ne
-; CHECK-LE-NEXT:    vcvtt.f16.f32 s4, s1
 ; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    vcvtb.f16.f32 s5, s2
+; CHECK-LE-NEXT:    vcvtb.f16.f32 s4, s0
+; CHECK-LE-NEXT:    vcvtt.f16.f32 s4, s1
+; CHECK-LE-NEXT:    vcvtb.f16.f32 s6, s2
 ; CHECK-LE-NEXT:    vcvtt.f16.f32 s5, s3
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    bne .LBB25_5
@@ -1328,12 +1328,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    it gt
 ; CHECK-BE-NEXT:    movgt r2, #1
 ; CHECK-BE-NEXT:    cmp r2, #0
-; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    csetm r2, ne
-; CHECK-BE-NEXT:    vcvtt.f16.f32 s0, s5
+; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    vcvtb.f16.f32 s1, s6
+; CHECK-BE-NEXT:    vcvtt.f16.f32 s0, s5
+; CHECK-BE-NEXT:    vcvtb.f16.f32 s2, s6
 ; CHECK-BE-NEXT:    vcvtt.f16.f32 s1, s7
 ; CHECK-BE-NEXT:    lsls r2, r1, #28
 ; CHECK-BE-NEXT:    bmi .LBB25_5
@@ -1354,8 +1354,8 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    bpl .LBB25_2
 ; CHECK-BE-NEXT:  .LBB25_6: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #2]
+; CHECK-BE-NEXT:    vmovx.f16 s0, s0
+; CHECK-BE-NEXT:    vstr.16 s0, [r0, #2]
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    bpl .LBB25_3
 ; CHECK-BE-NEXT:  .LBB25_7: @ %cond.store3
@@ -1409,12 +1409,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float>
 ; CHECK-LE-NEXT:    it gt
 ; CHECK-LE-NEXT:    movgt r2, #1
 ; CHECK-LE-NEXT:    cmp r2, #0
-; CHECK-LE-NEXT:    vcvtb.f16.f32 s4, s0
 ; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-LE-NEXT:    csetm r2, ne
-; CHECK-LE-NEXT:    vcvtt.f16.f32 s4, s1
 ; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    vcvtb.f16.f32 s5, s2
+; CHECK-LE-NEXT:    vcvtb.f16.f32 s4, s0
+; CHECK-LE-NEXT:    vcvtt.f16.f32 s4, s1
+; CHECK-LE-NEXT:    vcvtb.f16.f32 s6, s2
 ; CHECK-LE-NEXT:    vcvtt.f16.f32 s5, s3
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    bne .LBB26_5
@@ -1484,12 +1484,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    it gt
 ; CHECK-BE-NEXT:    movgt r2, #1
 ; CHECK-BE-NEXT:    cmp r2, #0
-; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    csetm r2, ne
-; CHECK-BE-NEXT:    vcvtt.f16.f32 s0, s5
+; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    vcvtb.f16.f32 s1, s6
+; CHECK-BE-NEXT:    vcvtt.f16.f32 s0, s5
+; CHECK-BE-NEXT:    vcvtb.f16.f32 s2, s6
 ; CHECK-BE-NEXT:    vcvtt.f16.f32 s1, s7
 ; CHECK-BE-NEXT:    lsls r2, r1, #28
 ; CHECK-BE-NEXT:    bmi .LBB26_5
@@ -1510,8 +1510,8 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    bpl .LBB26_2
 ; CHECK-BE-NEXT:  .LBB26_6: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vstr.16 s4, [r0, #2]
+; CHECK-BE-NEXT:    vmovx.f16 s0, s0
+; CHECK-BE-NEXT:    vstr.16 s0, [r0, #2]
 ; CHECK-BE-NEXT:    lsls r2, r1, #30
 ; CHECK-BE-NEXT:    bpl .LBB26_3
 ; CHECK-BE-NEXT:  .LBB26_7: @ %cond.store3
@@ -1565,12 +1565,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float>
 ; CHECK-LE-NEXT:    it gt
 ; CHECK-LE-NEXT:    movgt r2, #1
 ; CHECK-LE-NEXT:    cmp r2, #0
-; CHECK-LE-NEXT:    vcvtb.f16.f32 s4, s0
 ; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-LE-NEXT:    csetm r2, ne
-; CHECK-LE-NEXT:    vcvtt.f16.f32 s4, s1
 ; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-LE-NEXT:    vcvtb.f16.f32 s5, s2
+; CHECK-LE-NEXT:    vcvtb.f16.f32 s4, s0
+; CHECK-LE-NEXT:    vcvtt.f16.f32 s4, s1
+; CHECK-LE-NEXT:    vcvtb.f16.f32 s6, s2
 ; CHECK-LE-NEXT:    vcvtt.f16.f32 s5, s3
 ; CHECK-LE-NEXT:    lsls r2, r1, #31
 ; CHECK-LE-NEXT:    bne .LBB27_5
@@ -1648,12 +1648,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    it gt
 ; CHECK-BE-NEXT:    movgt r2, #1
 ; CHECK-BE-NEXT:    cmp r2, #0
-; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    csetm r2, ne
-; CHECK-BE-NEXT:    vcvtt.f16.f32 s0, s5
+; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
-; CHECK-BE-NEXT:    vcvtb.f16.f32 s1, s6
+; CHECK-BE-NEXT:    vcvtt.f16.f32 s0, s5
+; CHECK-BE-NEXT:    vcvtb.f16.f32 s2, s6
 ; CHECK-BE-NEXT:    vcvtt.f16.f32 s1, s7
 ; CHECK-BE-NEXT:    lsls r2, r1, #28
 ; CHECK-BE-NEXT:    bmi .LBB27_5
@@ -1676,8 +1676,8 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    lsls r2, r1, #29
 ; CHECK-BE-NEXT:    bpl .LBB27_2
 ; CHECK-BE-NEXT:  .LBB27_6: @ %cond.store1
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vstr.16 s4, [sp, #8]
+; CHECK-BE-NEXT:    vmovx.f16 s0, s0
+; CHECK-BE-NEXT:    vstr.16 s0, [sp, #8]
 ; CHECK-BE-NEXT:    ldrh.w r2, [sp, #8]
 ; CHECK-BE-NEXT:    strh r2, [r0, #2]
 ; CHECK-BE-NEXT:    lsls r2, r1, #30

diff  --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index 6b2939c3a0c1b..912773e2d5131 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -279,11 +279,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @maxnm_float32_t(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-MVE-LABEL: maxnm_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmaxnm.f32 s11, s7, s3
-; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s6, s2
-; CHECK-MVE-NEXT:    vmaxnm.f32 s9, s5, s1
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vmaxnm.f32 s3, s7, s3
+; CHECK-MVE-NEXT:    vmaxnm.f32 s2, s6, s2
+; CHECK-MVE-NEXT:    vmaxnm.f32 s1, s5, s1
+; CHECK-MVE-NEXT:    vmaxnm.f32 s0, s4, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: maxnm_float32_t:
@@ -299,27 +298,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: minnm_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q2, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s2, s0
-; CHECK-MVE-NEXT:    vminnm.f16 s0, s4, s8
-; CHECK-MVE-NEXT:    vins.f16 s0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vminnm.f16 s1, s5, s9
-; CHECK-MVE-NEXT:    vins.f16 s1, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vminnm.f16 s2, s6, s10
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vins.f16 s2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vminnm.f16 s3, s7, s11
-; CHECK-MVE-NEXT:    vins.f16 s3, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-MVE-NEXT:    vminnm.f16 s8, s10, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vminnm.f16 s1, s5, s1
+; CHECK-MVE-NEXT:    vminnm.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vminnm.f16 s2, s6, s2
+; CHECK-MVE-NEXT:    vminnm.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
+; CHECK-MVE-NEXT:    vminnm.f16 s3, s7, s3
+; CHECK-MVE-NEXT:    vminnm.f16 s4, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: minnm_float16_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll
index 24eef30656e95..ded3d3141e361 100644
--- a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll
@@ -104,20 +104,20 @@ define arm_aapcs_vfpcc <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float>
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NOFP-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NOFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NOFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NOFP-NEXT:    vmov q5, q1
-; CHECK-NOFP-NEXT:    vmov q6, q0
-; CHECK-NOFP-NEXT:    vmov r4, r0, d13
-; CHECK-NOFP-NEXT:    vmov r5, r1, d11
+; CHECK-NOFP-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NOFP-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NOFP-NEXT:    vmov q4, q1
+; CHECK-NOFP-NEXT:    vmov q5, q0
+; CHECK-NOFP-NEXT:    vmov r4, r0, d11
+; CHECK-NOFP-NEXT:    vmov r5, r1, d9
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
 ; CHECK-NOFP-NEXT:    vmov s19, r0
 ; CHECK-NOFP-NEXT:    mov r0, r4
 ; CHECK-NOFP-NEXT:    mov r1, r5
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
 ; CHECK-NOFP-NEXT:    vmov s18, r0
-; CHECK-NOFP-NEXT:    vmov r4, r0, d12
-; CHECK-NOFP-NEXT:    vmov r5, r1, d10
+; CHECK-NOFP-NEXT:    vmov r4, r0, d10
+; CHECK-NOFP-NEXT:    vmov r5, r1, d8
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
 ; CHECK-NOFP-NEXT:    vmov s17, r0
 ; CHECK-NOFP-NEXT:    mov r0, r4
@@ -125,7 +125,7 @@ define arm_aapcs_vfpcc <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float>
 ; CHECK-NOFP-NEXT:    bl __aeabi_fadd
 ; CHECK-NOFP-NEXT:    vmov s16, r0
 ; CHECK-NOFP-NEXT:    vmov q0, q4
-; CHECK-NOFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NOFP-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NOFP-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-FP-LABEL: vector_add_f32:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index 8c8667f1762a5..8a1109950c03a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -152,40 +152,39 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    .pad #408
 ; CHECK-NEXT:    sub sp, #408
 ; CHECK-NEXT:    movw r7, :lower16:.L_MergedGlobals
-; CHECK-NEXT:    vldr s12, .LCPI1_0
-; CHECK-NEXT:    movt r7, :upper16:.L_MergedGlobals
 ; CHECK-NEXT:    vldr s15, .LCPI1_1
-; CHECK-NEXT:    mov r3, r7
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:    ldr r0, [r3, #4]!
+; CHECK-NEXT:    movt r7, :upper16:.L_MergedGlobals
 ; CHECK-NEXT:    movw r2, :lower16:e
+; CHECK-NEXT:    mov r4, r7
+; CHECK-NEXT:    mov r3, r7
 ; CHECK-NEXT:    ldr r6, [r4, #8]!
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    vmov s13, r3
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    movt r2, :upper16:e
+; CHECK-NEXT:    ldr r0, [r3, #4]!
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    movt r2, :upper16:e
+; CHECK-NEXT:    vmov r5, s15
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT:    vmov s21, r2
-; CHECK-NEXT:    vmov.f32 s14, s13
+; CHECK-NEXT:    vmov s13, r3
+; CHECK-NEXT:    vldr s12, .LCPI1_0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r5, r2
-; CHECK-NEXT:    vmov.f32 s20, s12
 ; CHECK-NEXT:    vdup.32 q7, r3
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r5
-; CHECK-NEXT:    vmov.f32 s22, s13
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #92]
 ; CHECK-NEXT:    vmov q0, q7
 ; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vmov q4, q7
 ; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.32 q7[1], r2
-; CHECK-NEXT:    vmov.f32 s23, s15
+; CHECK-NEXT:    vmov s21, r2
 ; CHECK-NEXT:    movs r1, #64
+; CHECK-NEXT:    vmov.f32 s20, s12
 ; CHECK-NEXT:    str r0, [sp, #40]
-; CHECK-NEXT:    vstrw.32 q5, [r0]
+; CHECK-NEXT:    vmov.f32 s22, s13
 ; CHECK-NEXT:    str r6, [r0]
-; CHECK-NEXT:    vstrw.32 q7, [r0]
+; CHECK-NEXT:    vmov.f32 s23, s15
 ; CHECK-NEXT:    str r0, [r0]
+; CHECK-NEXT:    vstrw.32 q5, [r0]
+; CHECK-NEXT:    vstrw.32 q7, [r0]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    vstrw.32 q6, [r0]
 ; CHECK-NEXT:    mov.w r8, #0
@@ -193,6 +192,7 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r3
 ; CHECK-NEXT:    mov.w r12, #4
 ; CHECK-NEXT:    vmov q1[3], q1[1], r2, r4
+; CHECK-NEXT:    vmov.f32 s14, s13
 ; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
 ; CHECK-NEXT:    vmov.32 q4[0], r8
 ; CHECK-NEXT:    @ implicit-def: $r2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index 17a47e5ec54c7..9f44be17172fb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -34,13 +34,13 @@ define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    letp lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    vadd.f32 s4, s2, s3
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-NEXT:    add.w r7, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    adds r0, #1
 ; CHECK-NEXT:    add r3, r9
 ; CHECK-NEXT:    cmp r0, r12
-; CHECK-NEXT:    vadd.f32 s0, s0, s4
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-NEXT:    vstr s0, [r7]
 ; CHECK-NEXT:    bne .LBB0_2
 ; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
@@ -138,15 +138,15 @@ define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    letp lr, .LBB1_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    vadd.f32 s8, s2, s3
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-NEXT:    add.w r0, r2, r9, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    add r11, r10
-; CHECK-NEXT:    vadd.f32 s2, s6, s7
+; CHECK-NEXT:    vadd.f32 s6, s6, s7
 ; CHECK-NEXT:    add r6, r10
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    vadd.f32 s0, s0, s8
-; CHECK-NEXT:    vadd.f32 s2, s4, s2
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    vadd.f32 s2, s4, s6
 ; CHECK-NEXT:    vstr s0, [r0]
 ; CHECK-NEXT:    add.w r0, r2, r4, lsl #2
 ; CHECK-NEXT:    adds r4, #2
@@ -279,21 +279,21 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    letp lr, .LBB2_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB2_2 Depth=1
-; CHECK-NEXT:    vadd.f32 s12, s10, s11
+; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    adds r0, r5, #1
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
 ; CHECK-NEXT:    add r10, r11
-; CHECK-NEXT:    vadd.f32 s10, s6, s7
+; CHECK-NEXT:    vadd.f32 s6, s6, s7
 ; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
 ; CHECK-NEXT:    add r12, r11
-; CHECK-NEXT:    vadd.f32 s6, s2, s3
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-NEXT:    add r8, r11
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    vadd.f32 s2, s8, s12
-; CHECK-NEXT:    vadd.f32 s4, s4, s10
-; CHECK-NEXT:    vadd.f32 s0, s0, s6
-; CHECK-NEXT:    vstr s2, [r0]
+; CHECK-NEXT:    vadd.f32 s8, s8, s10
+; CHECK-NEXT:    vadd.f32 s4, s4, s6
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    vstr s8, [r0]
 ; CHECK-NEXT:    add.w r0, r2, r5, lsl #2
 ; CHECK-NEXT:    vstr s4, [r0]
 ; CHECK-NEXT:    adds r0, r5, #2
@@ -450,22 +450,22 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    letp lr, .LBB3_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT:    vadd.f32 s16, s14, s15
+; CHECK-NEXT:    vadd.f32 s14, s14, s15
 ; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
 ; CHECK-NEXT:    adds r0, r6, #1
-; CHECK-NEXT:    vadd.f32 s14, s10, s11
+; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
 ; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
-; CHECK-NEXT:    vadd.f32 s10, s6, s7
+; CHECK-NEXT:    vadd.f32 s6, s6, s7
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    vadd.f32 s6, s2, s3
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    vadd.f32 s2, s12, s16
-; CHECK-NEXT:    vadd.f32 s8, s8, s14
-; CHECK-NEXT:    vadd.f32 s4, s4, s10
-; CHECK-NEXT:    vadd.f32 s0, s0, s6
-; CHECK-NEXT:    vstr s2, [r0]
+; CHECK-NEXT:    vadd.f32 s12, s12, s14
+; CHECK-NEXT:    vadd.f32 s8, s8, s10
+; CHECK-NEXT:    vadd.f32 s4, s4, s6
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    vstr s12, [r0]
 ; CHECK-NEXT:    add.w r0, r1, r6, lsl #2
 ; CHECK-NEXT:    vstr s8, [r0]
 ; CHECK-NEXT:    adds r0, r6, #2
@@ -645,26 +645,26 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    letp lr, .LBB4_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT:    vadd.f32 s20, s18, s19
+; CHECK-NEXT:    vadd.f32 s18, s18, s19
 ; CHECK-NEXT:    add.w r3, r2, r11, lsl #2
 ; CHECK-NEXT:    vadd.f32 s16, s16, s17
-; CHECK-NEXT:    vadd.f32 s18, s14, s15
+; CHECK-NEXT:    vadd.f32 s14, s14, s15
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    vadd.f32 s14, s6, s7
+; CHECK-NEXT:    vadd.f32 s6, s6, s7
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    vadd.f32 s6, s10, s11
+; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    vadd.f32 s10, s2, s3
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    vadd.f32 s2, s16, s20
-; CHECK-NEXT:    vadd.f32 s12, s12, s18
-; CHECK-NEXT:    vadd.f32 s4, s4, s14
-; CHECK-NEXT:    vadd.f32 s6, s8, s6
-; CHECK-NEXT:    vadd.f32 s0, s0, s10
-; CHECK-NEXT:    vstr s2, [r3]
+; CHECK-NEXT:    vadd.f32 s1, s16, s18
+; CHECK-NEXT:    vadd.f32 s12, s12, s14
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
+; CHECK-NEXT:    vadd.f32 s4, s4, s6
+; CHECK-NEXT:    vadd.f32 s6, s8, s10
+; CHECK-NEXT:    vstr s1, [r3]
 ; CHECK-NEXT:    add.w r3, r2, r0, lsl #2
 ; CHECK-NEXT:    vstr s12, [r3]
 ; CHECK-NEXT:    adds r3, r0, #2
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-NEXT:    add.w r3, r2, r3, lsl #2
 ; CHECK-NEXT:    vstr s6, [r3]
 ; CHECK-NEXT:    adds r3, r0, #3
@@ -858,32 +858,32 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    letp lr, .LBB5_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT:    vadd.f32 s24, s22, s23
+; CHECK-NEXT:    vadd.f32 s22, s22, s23
 ; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT:    vadd.f32 s20, s20, s21
-; CHECK-NEXT:    vadd.f32 s22, s18, s19
+; CHECK-NEXT:    vadd.f32 s18, s18, s19
 ; CHECK-NEXT:    vadd.f32 s16, s16, s17
-; CHECK-NEXT:    vadd.f32 s18, s6, s7
-; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    vadd.f32 s6, s14, s15
-; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    vadd.f32 s14, s10, s11
+; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    vadd.f32 s10, s2, s3
-; CHECK-NEXT:    vadd.f32 s2, s20, s24
-; CHECK-NEXT:    vadd.f32 s1, s16, s22
-; CHECK-NEXT:    vadd.f32 s6, s12, s6
-; CHECK-NEXT:    vadd.f32 s4, s4, s18
-; CHECK-NEXT:    vadd.f32 s8, s8, s14
-; CHECK-NEXT:    vadd.f32 s0, s0, s10
-; CHECK-NEXT:    vstr s2, [r1]
-; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
+; CHECK-NEXT:    vadd.f32 s1, s20, s22
+; CHECK-NEXT:    vadd.f32 s6, s6, s7
+; CHECK-NEXT:    vadd.f32 s3, s16, s18
+; CHECK-NEXT:    vadd.f32 s4, s4, s5
+; CHECK-NEXT:    vadd.f32 s8, s8, s10
+; CHECK-NEXT:    vadd.f32 s14, s14, s15
+; CHECK-NEXT:    vadd.f32 s12, s12, s13
 ; CHECK-NEXT:    vstr s1, [r1]
+; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    vstr s3, [r1]
 ; CHECK-NEXT:    adds r1, r0, #2
+; CHECK-NEXT:    vadd.f32 s4, s4, s6
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s8, [r1]
 ; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    vadd.f32 s6, s12, s14
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s0, [r1]
 ; CHECK-NEXT:    adds r1, r0, #4
@@ -1089,19 +1089,17 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vmov q5, q4
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov q2, q3
+; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q0, [r7]
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    adds r6, r7, r5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q1, q0, q7
+; CHECK-NEXT:    adds r6, r7, r5
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q1, q3
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vmov q2, q4
+; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vmov q4, q5
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
@@ -1122,32 +1120,32 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    vadd.f32 s0, s26, s27
 ; CHECK-NEXT:    add.w r1, r2, r12, lsl #2
 ; CHECK-NEXT:    vadd.f32 s2, s24, s25
-; CHECK-NEXT:    vadd.f32 s3, s20, s21
 ; CHECK-NEXT:    vadd.f32 s1, s22, s23
-; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    vadd.f32 s20, s10, s11
-; CHECK-NEXT:    vadd.f32 s11, s14, s15
-; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    vadd.f32 s14, s6, s7
+; CHECK-NEXT:    vadd.f32 s3, s20, s21
+; CHECK-NEXT:    vadd.f32 s6, s6, s7
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
-; CHECK-NEXT:    vadd.f32 s10, s18, s19
-; CHECK-NEXT:    vadd.f32 s9, s16, s17
+; CHECK-NEXT:    vadd.f32 s10, s10, s11
+; CHECK-NEXT:    vadd.f32 s8, s8, s9
+; CHECK-NEXT:    vadd.f32 s9, s18, s19
+; CHECK-NEXT:    vadd.f32 s11, s16, s17
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vadd.f32 s0, s2, s0
-; CHECK-NEXT:    vadd.f32 s6, s18, s19
-; CHECK-NEXT:    vadd.f32 s5, s16, s17
+; CHECK-NEXT:    vadd.f32 s5, s18, s19
+; CHECK-NEXT:    vadd.f32 s7, s16, s17
 ; CHECK-NEXT:    vadd.f32 s2, s3, s1
-; CHECK-NEXT:    vadd.f32 s4, s4, s14
-; CHECK-NEXT:    vadd.f32 s12, s12, s11
-; CHECK-NEXT:    vadd.f32 s10, s9, s10
+; CHECK-NEXT:    vadd.f32 s4, s4, s6
+; CHECK-NEXT:    vadd.f32 s14, s14, s15
+; CHECK-NEXT:    vadd.f32 s12, s12, s13
 ; CHECK-NEXT:    vstr s0, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
-; CHECK-NEXT:    vadd.f32 s8, s8, s20
-; CHECK-NEXT:    vadd.f32 s6, s5, s6
+; CHECK-NEXT:    vadd.f32 s8, s8, s10
+; CHECK-NEXT:    vadd.f32 s6, s7, s5
 ; CHECK-NEXT:    vstr s2, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
+; CHECK-NEXT:    vadd.f32 s10, s11, s9
 ; CHECK-NEXT:    vstr s4, [r1]
 ; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    vadd.f32 s12, s12, s14
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s6, [r1]
 ; CHECK-NEXT:    adds r1, r0, #4
@@ -1408,33 +1406,33 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float*
 ; CHECK-NEXT:    vadd.f32 s0, s30, s31
 ; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
 ; CHECK-NEXT:    vadd.f32 s2, s28, s29
-; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    vadd.f32 s5, s14, s15
 ; CHECK-NEXT:    vadd.f32 s4, s26, s27
 ; CHECK-NEXT:    vadd.f32 s6, s24, s25
-; CHECK-NEXT:    vadd.f32 s14, s18, s19
+; CHECK-NEXT:    vadd.f32 s5, s18, s19
 ; CHECK-NEXT:    vadd.f32 s7, s16, s17
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    vadd.f32 s13, s10, s11
-; CHECK-NEXT:    vadd.f32 s10, s18, s19
-; CHECK-NEXT:    vadd.f32 s9, s16, s17
+; CHECK-NEXT:    vadd.f32 s9, s18, s19
+; CHECK-NEXT:    vadd.f32 s11, s16, s17
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vadd.f32 s0, s2, s0
-; CHECK-NEXT:    vadd.f32 s11, s18, s19
+; CHECK-NEXT:    vadd.f32 s14, s14, s15
+; CHECK-NEXT:    vadd.f32 s12, s12, s13
+; CHECK-NEXT:    vadd.f32 s13, s18, s19
 ; CHECK-NEXT:    vadd.f32 s15, s16, s17
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
 ; CHECK-NEXT:    vadd.f32 s2, s6, s4
-; CHECK-NEXT:    vadd.f32 s6, s12, s5
-; CHECK-NEXT:    vadd.f32 s12, s7, s14
-; CHECK-NEXT:    vadd.f32 s10, s9, s10
+; CHECK-NEXT:    vadd.f32 s8, s8, s10
+; CHECK-NEXT:    vadd.f32 s10, s11, s9
+; CHECK-NEXT:    vadd.f32 s6, s12, s14
+; CHECK-NEXT:    vadd.f32 s1, s22, s23
+; CHECK-NEXT:    vadd.f32 s14, s15, s13
 ; CHECK-NEXT:    vstr s0, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
-; CHECK-NEXT:    vadd.f32 s8, s8, s13
-; CHECK-NEXT:    vadd.f32 s14, s15, s11
+; CHECK-NEXT:    vadd.f32 s3, s20, s21
 ; CHECK-NEXT:    vstr s2, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r12, lsl #2
-; CHECK-NEXT:    vadd.f32 s1, s22, s23
-; CHECK-NEXT:    vadd.f32 s3, s20, s21
+; CHECK-NEXT:    vadd.f32 s12, s7, s5
 ; CHECK-NEXT:    vstr s10, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT:    vstr s14, [r1]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll
index 0f1b483a0eb1e..b4f7a8ca4d47d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll
@@ -159,11 +159,11 @@ define arm_aapcs_vfpcc <2 x i64> @build_var0_v2i1(i32 %s, i32 %t, <2 x i64> %a,
 ; CHECK-LABEL: build_var0_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    vldr s10, .LCPI9_0
 ; CHECK-NEXT:    csetm r0, lo
 ; CHECK-NEXT:    vmov s8, r0
-; CHECK-NEXT:    vldr s10, .LCPI9_0
-; CHECK-NEXT:    vmov.f32 s9, s8
 ; CHECK-NEXT:    vmov.f32 s11, s10
+; CHECK-NEXT:    vmov.f32 s9, s8
 ; CHECK-NEXT:    vbic q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vorr q0, q0, q1
@@ -183,9 +183,9 @@ define arm_aapcs_vfpcc <2 x i64> @build_var1_v2i1(i32 %s, i32 %t, <2 x i64> %a,
 ; CHECK-LABEL: build_var1_v2i1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    vldr s8, .LCPI10_0
 ; CHECK-NEXT:    csetm r0, lo
 ; CHECK-NEXT:    vmov s10, r0
-; CHECK-NEXT:    vldr s8, .LCPI10_0
 ; CHECK-NEXT:    vmov.f32 s9, s8
 ; CHECK-NEXT:    vmov.f32 s11, s10
 ; CHECK-NEXT:    vbic q1, q1, q2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
index e9ae87165b455..477db0718410e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
@@ -79,9 +79,9 @@ entry:
 define <4 x i32> @shuffle2_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: shuffle2_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov d1, r2, r3
 ; CHECK-NEXT:    vmov d0, r0, r1
 ; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vmov d1, r2, r3
 ; CHECK-NEXT:    vcmp.i32 eq, q0, zr
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    mov r0, sp
@@ -100,9 +100,9 @@ entry:
 define <8 x i16> @shuffle2_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: shuffle2_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov d1, r2, r3
 ; CHECK-NEXT:    vmov d0, r0, r1
 ; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vmov d1, r2, r3
 ; CHECK-NEXT:    vcmp.i16 eq, q0, zr
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    mov r0, sp
@@ -121,9 +121,9 @@ entry:
 define <16 x i8> @shuffle2_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: shuffle2_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov d1, r2, r3
 ; CHECK-NEXT:    vmov d0, r0, r1
 ; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vmov d1, r2, r3
 ; CHECK-NEXT:    vcmp.i8 eq, q0, zr
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    mov r0, sp
@@ -223,9 +223,9 @@ entry:
 define <4 x i32> @shuffle4_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: shuffle4_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov d1, r2, r3
-; CHECK-NEXT:    vmov.i8 q1, #0xff
 ; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    vmov d1, r2, r3
 ; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vcmp.i32 eq, q0, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index ca9a725c79abd..3f7b0e6a437b1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -253,16 +253,15 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    mov.w r2, #-1
 ; CHECK-NEXT:    vmov.f32 s16, s10
 ; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    vmov.f32 s20, s14
 ; CHECK-NEXT:    vmov.f32 s18, s11
+; CHECK-NEXT:    vmov.f32 s20, s14
 ; CHECK-NEXT:    vmov.f32 s22, s15
 ; CHECK-NEXT:    vmullb.s32 q6, q5, q4
-; CHECK-NEXT:    vmov.f32 s14, s13
+; CHECK-NEXT:    vmov.f32 s10, s9
 ; CHECK-NEXT:    vmov r4, r7, d13
 ; CHECK-NEXT:    asrl r4, r7, #31
-; CHECK-NEXT:    vmov.f32 s10, s9
-; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
 ; CHECK-NEXT:    vmov r6, s12
+; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
 ; CHECK-NEXT:    sbcs.w r5, r2, r7
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    it lt
@@ -306,10 +305,11 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    csetm r4, ne
 ; CHECK-NEXT:    vmov q5[2], q5[0], r3, r4
 ; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov r4, s14
+; CHECK-NEXT:    vmov.f32 s10, s13
 ; CHECK-NEXT:    vbic q6, q1, q5
 ; CHECK-NEXT:    vand q4, q4, q5
 ; CHECK-NEXT:    vorr q4, q4, q6
+; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    smull r6, r5, r6, r5
 ; CHECK-NEXT:    asrl r6, r5, #31
 ; CHECK-NEXT:    smull r4, r7, r4, r3
@@ -522,17 +522,15 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    vorr q4, q4, q0
 ; CHECK-NEXT:    vpt.u32 cs, q1, q4
 ; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
-; CHECK-NEXT:    vmov.f32 s24, s18
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q5, [r1], #16
-; CHECK-NEXT:    vmov.f32 s28, s22
+; CHECK-NEXT:    vmov.f32 s24, s18
 ; CHECK-NEXT:    vmov.f32 s26, s19
+; CHECK-NEXT:    vmov.f32 s28, s22
 ; CHECK-NEXT:    vmov.f32 s30, s23
 ; CHECK-NEXT:    vmullb.s32 q0, q7, q6
-; CHECK-NEXT:    vmov.f32 s18, s17
 ; CHECK-NEXT:    vmov r6, r5, d1
 ; CHECK-NEXT:    asrl r6, r5, #31
-; CHECK-NEXT:    vmov.f32 s22, s21
 ; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
 ; CHECK-NEXT:    sbcs.w r7, r12, r5
 ; CHECK-NEXT:    mov.w r7, #0
@@ -575,11 +573,13 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    csetm r4, ne
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r4
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov r4, s22
 ; CHECK-NEXT:    vbic q7, q3, q0
 ; CHECK-NEXT:    vand q0, q6, q0
 ; CHECK-NEXT:    vorr q6, q0, q7
+; CHECK-NEXT:    vmov.f32 s2, s17
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov.f32 s2, s21
+; CHECK-NEXT:    vmov r4, s2
 ; CHECK-NEXT:    smull r6, r5, r4, r3
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    asrl r6, r5, #31

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
index 0e80c6241c041..a4b3632e4dd7e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@@ -93,23 +93,23 @@ define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <
 ; CHECK-LABEL: scaled_v8f16_sext:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q1, [r1]
-; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vshl.i32 q2, q1, #1
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vmov r1, r2, d4
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vstr.16 s0, [r1]
-; CHECK-NEXT:    vstr.16 s12, [r2]
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vstr.16 s0, [r2]
 ; CHECK-NEXT:    vmov r1, r2, d5
-; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vstr.16 s1, [r1]
-; CHECK-NEXT:    vstr.16 s8, [r2]
+; CHECK-NEXT:    vstr.16 s0, [r2]
 ; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vstr.16 s3, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
index f549cebe304e9..fba6524589e59 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
@@ -111,20 +111,20 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q2, [r1]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    vstr.16 s0, [r1]
-; CHECK-NEXT:    vstr.16 s12, [r2]
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vstr.16 s0, [r2]
 ; CHECK-NEXT:    vmov r1, r2, d5
-; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vstr.16 s1, [r1]
-; CHECK-NEXT:    vstr.16 s8, [r2]
+; CHECK-NEXT:    vstr.16 s0, [r2]
 ; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vstr.16 s3, [r0]
@@ -184,20 +184,20 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q2, [r1]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vmov r1, r2, d4
 ; CHECK-NEXT:    vstr.16 s0, [r1]
-; CHECK-NEXT:    vstr.16 s12, [r2]
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vstr.16 s0, [r2]
 ; CHECK-NEXT:    vmov r1, r2, d5
-; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vstr.16 s1, [r1]
-; CHECK-NEXT:    vstr.16 s8, [r2]
+; CHECK-NEXT:    vstr.16 s0, [r2]
 ; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vstr.16 s3, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
index affb361febd68..4c2ef5e01e28c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
@@ -291,8 +291,8 @@ entry:
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vldrb.s32 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vstrw.32 q0, [r0, q2]
@@ -310,8 +310,8 @@ entry:
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vldrb.u32 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vstrw.32 q0, [r0, q2]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
index e8daac426b4cf..edd8a07166e4a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -312,18 +312,18 @@ define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, <8 x half*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    vstr.16 s12, [r1]
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    vmov r0, r1, d5
-; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vstr.16 s1, [r0]
-; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vstr.16 s2, [r0]
-; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmovx.f16 s0, s3
 ; CHECK-NEXT:    vstr.16 s3, [r0]
@@ -339,10 +339,10 @@ define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, <4 x half*>* %offptr) {
 ; CHECK-LABEL: ptr_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s0
 ; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vstr.16 s0, [r1]
 ; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vstr.16 s1, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
index 54249151d448e..bdf5fb2354ed2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -52,30 +52,29 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpt.s32 lt, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q5, [r0]
-; CHECK-NEXT:    vmov.f64 d8, d10
-; CHECK-NEXT:    vmov.f32 s18, s21
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.f32 s2, s21
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    bl __aeabi_l2d
-; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    asrs r3, r2, #31
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_l2d
-; CHECK-NEXT:    vmov.f64 d12, d11
-; CHECK-NEXT:    vmov.f32 s26, s23
+; CHECK-NEXT:    vmov.f32 s2, s23
 ; CHECK-NEXT:    vmov d8, r0, r1
-; CHECK-NEXT:    vmov r2, s26
+; CHECK-NEXT:    vmov.f32 s20, s22
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    asrs r3, r2, #31
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_l2d
-; CHECK-NEXT:    vmov r2, s24
+; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov d11, r0, r1
 ; CHECK-NEXT:    asrs r3, r2, #31
 ; CHECK-NEXT:    mov r0, r2
@@ -84,7 +83,7 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
 ; CHECK-NEXT:    vmov d10, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vmov q1, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index b66e7b24536cf..6ede494c81ea7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -118,8 +118,8 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle2step_i32(<8 x i32> %src) {
 ; CHECK-NEXT:    vmov.f32 s9, s3
 ; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vmov.f32 s10, s5
-; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    bx lr
@@ -135,17 +135,17 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s12, s1
-; CHECK-NEXT:    vmov.f32 s16, s0
 ; CHECK-NEXT:    vmov.f32 s13, s4
-; CHECK-NEXT:    vmov.f32 s17, s3
 ; CHECK-NEXT:    vmov.f32 s14, s7
 ; CHECK-NEXT:    vmov.f32 s18, s6
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vmov.f32 s12, s1
 ; CHECK-NEXT:    vmov.f32 s15, s10
+; CHECK-NEXT:    vmov.f32 s16, s0
+; CHECK-NEXT:    vmov.f32 s17, s3
 ; CHECK-NEXT:    vmov.f32 s19, s9
 ; CHECK-NEXT:    vadd.i32 q3, q4, q3
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    vadd.i32 q0, q3, q1
 ; CHECK-NEXT:    vpop {d8, d9}
@@ -167,18 +167,18 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle4step_i32(<16 x i32> %src) {
 ; CHECK-NEXT:    vmov.f32 s16, s3
 ; CHECK-NEXT:    vmov.f32 s20, s2
 ; CHECK-NEXT:    vmov.f32 s17, s7
-; CHECK-NEXT:    vmov.f32 s21, s6
 ; CHECK-NEXT:    vmov.f32 s18, s11
-; CHECK-NEXT:    vmov.f32 s22, s10
 ; CHECK-NEXT:    vmov.f32 s19, s15
+; CHECK-NEXT:    vmov.f32 s21, s6
+; CHECK-NEXT:    vmov.f32 s22, s10
 ; CHECK-NEXT:    vmov.f32 s23, s14
 ; CHECK-NEXT:    vadd.i32 q4, q5, q4
 ; CHECK-NEXT:    vmov.f32 s20, s1
 ; CHECK-NEXT:    vmov.f32 s21, s5
-; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vmov.f32 s22, s9
-; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s23, s13
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s3, s12
 ; CHECK-NEXT:    vadd.i32 q0, q0, q5
 ; CHECK-NEXT:    vadd.i32 q0, q0, q4
@@ -202,12 +202,12 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vmovx.f16 s0, s7
-; CHECK-NEXT:    vins.f16 s0, s7
 ; CHECK-NEXT:    vmovx.f16 s1, s6
-; CHECK-NEXT:    vins.f16 s1, s6
 ; CHECK-NEXT:    vmovx.f16 s2, s5
-; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
@@ -228,14 +228,14 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
 ; CHECK-LABEL: shuffle3_i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovx.f16 s1, s7
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vins.f16 s1, s7
 ; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s0, s4
 ; CHECK-NEXT:    vins.f16 s5, s4
-; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vins.f16 s2, s0
 ; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmovx.f16 s1, s7
 ; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vins.f16 s1, s7
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
@@ -323,30 +323,27 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) {
 ; CHECK-LABEL: shuffle2step_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8}
-; CHECK-NEXT:    vpush {d8}
 ; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vmovx.f16 s9, s2
+; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vins.f16 s2, s3
-; CHECK-NEXT:    vmovx.f16 s10, s4
-; CHECK-NEXT:    vmovx.f16 s16, s1
 ; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vins.f16 s8, s16
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s8, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vins.f16 s4, s5
+; CHECK-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NEXT:    vins.f16 s9, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vins.f16 s4, s5
 ; CHECK-NEXT:    vins.f16 s10, s0
-; CHECK-NEXT:    vmov.f32 s13, s2
-; CHECK-NEXT:    vmov.f32 s14, s4
-; CHECK-NEXT:    vmovx.f16 s0, s7
 ; CHECK-NEXT:    vmovx.f16 s11, s6
+; CHECK-NEXT:    vmovx.f16 s0, s7
 ; CHECK-NEXT:    vins.f16 s6, s7
+; CHECK-NEXT:    vmov.f32 s13, s2
 ; CHECK-NEXT:    vins.f16 s11, s0
+; CHECK-NEXT:    vmov.f32 s14, s4
 ; CHECK-NEXT:    vmov.f32 s15, s6
 ; CHECK-NEXT:    vadd.i16 q0, q3, q2
-; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -358,51 +355,54 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) {
 ; CHECK-LABEL: shuffle3step_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    .vsave {d11, d12, d13}
+; CHECK-NEXT:    vpush {d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s0
-; CHECK-NEXT:    vins.f16 s12, s16
-; CHECK-NEXT:    vmovx.f16 s16, s4
+; CHECK-NEXT:    vmovx.f16 s14, s1
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vmovx.f16 s14, s4
 ; CHECK-NEXT:    vmov.f32 s13, s3
-; CHECK-NEXT:    vmovx.f16 s20, s5
-; CHECK-NEXT:    vins.f16 s13, s16
-; CHECK-NEXT:    vmovx.f16 s16, s7
+; CHECK-NEXT:    vmovx.f16 s15, s7
+; CHECK-NEXT:    vins.f16 s13, s14
 ; CHECK-NEXT:    vmov.f32 s14, s6
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vins.f16 s14, s16
-; CHECK-NEXT:    vmovx.f16 s16, s2
-; CHECK-NEXT:    vins.f16 s1, s16
+; CHECK-NEXT:    vins.f16 s14, s15
+; CHECK-NEXT:    vmovx.f16 s15, s2
+; CHECK-NEXT:    vins.f16 s1, s15
+; CHECK-NEXT:    vmovx.f16 s15, s5
 ; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vins.f16 s17, s20
-; CHECK-NEXT:    vmovx.f16 s20, s10
-; CHECK-NEXT:    vmov.f32 s15, s9
-; CHECK-NEXT:    vins.f16 s15, s20
-; CHECK-NEXT:    vmovx.f16 s20, s11
-; CHECK-NEXT:    vins.f16 s10, s20
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vins.f16 s17, s15
 ; CHECK-NEXT:    vmov.f32 s16, s1
+; CHECK-NEXT:    vmovx.f16 s1, s10
+; CHECK-NEXT:    vmov.f32 s15, s9
+; CHECK-NEXT:    vins.f16 s15, s1
+; CHECK-NEXT:    vmovx.f16 s1, s11
+; CHECK-NEXT:    vins.f16 s10, s1
+; CHECK-NEXT:    vmovx.f16 s1, s3
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmovx.f16 s7, s9
 ; CHECK-NEXT:    vmov.f32 s23, s10
 ; CHECK-NEXT:    vmov.f32 s22, s8
-; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s1, s5
 ; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vins.f16 s6, s8
+; CHECK-NEXT:    vins.f16 s7, s11
 ; CHECK-NEXT:    vmovnb.i32 q6, q4
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmovnb.i32 q2, q0
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vmov.f32 s2, s10
 ; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmovx.f16 s20, s0
-; CHECK-NEXT:    vins.f16 s20, s2
-; CHECK-NEXT:    vmovx.f16 s21, s3
-; CHECK-NEXT:    vins.f16 s2, s8
-; CHECK-NEXT:    vmovx.f16 s3, s9
-; CHECK-NEXT:    vins.f16 s21, s5
-; CHECK-NEXT:    vins.f16 s3, s11
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovnb.i32 q1, q5
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s23, s3
-; CHECK-NEXT:    vadd.i16 q0, q3, q5
+; CHECK-NEXT:    vadd.i16 q0, q3, q0
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -416,53 +416,51 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
 ; CHECK-LABEL: shuffle4step_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmovx.f16 s18, s9
-; CHECK-NEXT:    vins.f16 s18, s20
-; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmovx.f16 s16, s11
+; CHECK-NEXT:    vins.f16 s18, s16
 ; CHECK-NEXT:    vmovx.f16 s19, s13
-; CHECK-NEXT:    vins.f16 s9, s11
-; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmovx.f16 s16, s15
 ; CHECK-NEXT:    vmovx.f16 s20, s3
+; CHECK-NEXT:    vins.f16 s19, s16
 ; CHECK-NEXT:    vmovx.f16 s16, s1
-; CHECK-NEXT:    vins.f16 s13, s15
 ; CHECK-NEXT:    vins.f16 s16, s20
-; CHECK-NEXT:    vmovx.f16 s20, s7
 ; CHECK-NEXT:    vmovx.f16 s17, s5
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vins.f16 s13, s15
+; CHECK-NEXT:    vins.f16 s5, s7
 ; CHECK-NEXT:    vins.f16 s1, s3
 ; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmovx.f16 s1, s10
 ; CHECK-NEXT:    vmov.f32 s22, s9
 ; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vins.f16 s5, s7
-; CHECK-NEXT:    vmov.f32 s20, s1
-; CHECK-NEXT:    vmovx.f16 s24, s10
 ; CHECK-NEXT:    vmov.f32 s21, s5
 ; CHECK-NEXT:    vadd.i16 q4, q5, q4
 ; CHECK-NEXT:    vmovx.f16 s22, s8
-; CHECK-NEXT:    vins.f16 s22, s24
-; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vins.f16 s22, s1
 ; CHECK-NEXT:    vmovx.f16 s23, s12
-; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vmov.f32 s10, s8
-; CHECK-NEXT:    vmov.f32 s11, s12
-; CHECK-NEXT:    vins.f16 s23, s24
-; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vmovx.f16 s1, s14
 ; CHECK-NEXT:    vmovx.f16 s20, s0
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vins.f16 s23, s1
+; CHECK-NEXT:    vmovx.f16 s1, s2
+; CHECK-NEXT:    vins.f16 s20, s1
 ; CHECK-NEXT:    vmovx.f16 s21, s4
-; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vins.f16 s8, s10
 ; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vins.f16 s21, s1
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmov.f32 s3, s12
 ; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmov.f32 s2, s10
-; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vadd.i16 q0, q0, q5
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -769,12 +767,11 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) {
 ; CHECK-NEXT:    vmov.8 q4[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[14]
 ; CHECK-NEXT:    vmov.8 q4[15], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
 ; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.f32 s15, s19
 ; CHECK-NEXT:    vmov.8 q5[11], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.f32 s14, s22
-; CHECK-NEXT:    vmov.f32 s15, s19
 ; CHECK-NEXT:    vmov.8 q4[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.8 q4[1], r0
@@ -797,19 +794,20 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) {
 ; CHECK-NEXT:    vmov.u8 r0, q1[14]
 ; CHECK-NEXT:    vmov.8 q4[10], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.f32 s14, s22
 ; CHECK-NEXT:    vmov.8 q5[12], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov q6, q4
 ; CHECK-NEXT:    vmov.8 q5[13], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[10]
 ; CHECK-NEXT:    vmov.8 q5[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[13]
 ; CHECK-NEXT:    vmov.8 q5[15], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[1]
-; CHECK-NEXT:    vmov q6, q4
 ; CHECK-NEXT:    vmov.8 q6[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.f32 s18, s26
 ; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vadd.i8 q3, q4, q3
 ; CHECK-NEXT:    vmov.8 q4[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[5]
@@ -1028,8 +1026,8 @@ define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
 ; CHECK-LABEL: shuffle2_i64:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s7, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
@@ -1146,8 +1144,8 @@ define arm_aapcs_vfpcc <4 x float> @shuffle2step_f32(<8 x float> %src) {
 ; CHECKFP-NEXT:    vmov.f32 s9, s3
 ; CHECKFP-NEXT:    vmov.f32 s1, s2
 ; CHECKFP-NEXT:    vmov.f32 s10, s5
-; CHECKFP-NEXT:    vmov.f32 s2, s4
 ; CHECKFP-NEXT:    vmov.f32 s11, s7
+; CHECKFP-NEXT:    vmov.f32 s2, s4
 ; CHECKFP-NEXT:    vmov.f32 s3, s6
 ; CHECKFP-NEXT:    vadd.f32 q0, q0, q2
 ; CHECKFP-NEXT:    bx lr
@@ -1163,17 +1161,17 @@ define arm_aapcs_vfpcc <4 x float> @shuffle3step_f32(<16 x float> %src) {
 ; CHECKFP:       @ %bb.0: @ %entry
 ; CHECKFP-NEXT:    .vsave {d8, d9}
 ; CHECKFP-NEXT:    vpush {d8, d9}
-; CHECKFP-NEXT:    vmov.f32 s12, s1
-; CHECKFP-NEXT:    vmov.f32 s16, s0
 ; CHECKFP-NEXT:    vmov.f32 s13, s4
-; CHECKFP-NEXT:    vmov.f32 s17, s3
 ; CHECKFP-NEXT:    vmov.f32 s14, s7
 ; CHECKFP-NEXT:    vmov.f32 s18, s6
-; CHECKFP-NEXT:    vmov.f32 s4, s2
-; CHECKFP-NEXT:    vmov.f32 s6, s8
+; CHECKFP-NEXT:    vmov.f32 s12, s1
 ; CHECKFP-NEXT:    vmov.f32 s15, s10
+; CHECKFP-NEXT:    vmov.f32 s16, s0
+; CHECKFP-NEXT:    vmov.f32 s17, s3
 ; CHECKFP-NEXT:    vmov.f32 s19, s9
 ; CHECKFP-NEXT:    vadd.f32 q3, q4, q3
+; CHECKFP-NEXT:    vmov.f32 s4, s2
+; CHECKFP-NEXT:    vmov.f32 s6, s8
 ; CHECKFP-NEXT:    vmov.f32 s7, s11
 ; CHECKFP-NEXT:    vadd.f32 q0, q3, q1
 ; CHECKFP-NEXT:    vpop {d8, d9}
@@ -1195,18 +1193,18 @@ define arm_aapcs_vfpcc <4 x float> @shuffle4step_f32(<16 x float> %src) {
 ; CHECKFP-NEXT:    vmov.f32 s16, s3
 ; CHECKFP-NEXT:    vmov.f32 s20, s2
 ; CHECKFP-NEXT:    vmov.f32 s17, s7
-; CHECKFP-NEXT:    vmov.f32 s21, s6
 ; CHECKFP-NEXT:    vmov.f32 s18, s11
-; CHECKFP-NEXT:    vmov.f32 s22, s10
 ; CHECKFP-NEXT:    vmov.f32 s19, s15
+; CHECKFP-NEXT:    vmov.f32 s21, s6
+; CHECKFP-NEXT:    vmov.f32 s22, s10
 ; CHECKFP-NEXT:    vmov.f32 s23, s14
 ; CHECKFP-NEXT:    vadd.f32 q4, q5, q4
 ; CHECKFP-NEXT:    vmov.f32 s20, s1
 ; CHECKFP-NEXT:    vmov.f32 s21, s5
-; CHECKFP-NEXT:    vmov.f32 s1, s4
 ; CHECKFP-NEXT:    vmov.f32 s22, s9
-; CHECKFP-NEXT:    vmov.f32 s2, s8
 ; CHECKFP-NEXT:    vmov.f32 s23, s13
+; CHECKFP-NEXT:    vmov.f32 s1, s4
+; CHECKFP-NEXT:    vmov.f32 s2, s8
 ; CHECKFP-NEXT:    vmov.f32 s3, s12
 ; CHECKFP-NEXT:    vadd.f32 q0, q0, q5
 ; CHECKFP-NEXT:    vadd.f32 q0, q0, q4
@@ -1230,12 +1228,12 @@ define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vmovx.f16 s0, s7
-; CHECK-NEXT:    vins.f16 s0, s7
 ; CHECK-NEXT:    vmovx.f16 s1, s6
-; CHECK-NEXT:    vins.f16 s1, s6
 ; CHECK-NEXT:    vmovx.f16 s2, s5
-; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1256,14 +1254,14 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
 ; CHECK-LABEL: shuffle3_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovx.f16 s1, s7
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vins.f16 s1, s7
 ; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s0, s4
 ; CHECK-NEXT:    vins.f16 s5, s4
-; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vins.f16 s2, s0
 ; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmovx.f16 s1, s7
 ; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vins.f16 s1, s7
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
@@ -1340,24 +1338,24 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) {
 ; CHECKFP-LABEL: shuffle2step_f16:
 ; CHECKFP:       @ %bb.0: @ %entry
-; CHECKFP-NEXT:    vmovx.f16 s12, s1
 ; CHECKFP-NEXT:    vmovx.f16 s8, s0
-; CHECKFP-NEXT:    vins.f16 s8, s12
-; CHECKFP-NEXT:    vmovx.f16 s12, s3
+; CHECKFP-NEXT:    vmovx.f16 s10, s1
+; CHECKFP-NEXT:    vins.f16 s8, s10
 ; CHECKFP-NEXT:    vmovx.f16 s9, s2
-; CHECKFP-NEXT:    vins.f16 s0, s1
-; CHECKFP-NEXT:    vins.f16 s9, s12
-; CHECKFP-NEXT:    vins.f16 s2, s3
+; CHECKFP-NEXT:    vmovx.f16 s10, s3
 ; CHECKFP-NEXT:    vmovx.f16 s12, s5
+; CHECKFP-NEXT:    vins.f16 s9, s10
 ; CHECKFP-NEXT:    vmovx.f16 s10, s4
 ; CHECKFP-NEXT:    vins.f16 s10, s12
-; CHECKFP-NEXT:    vins.f16 s4, s5
-; CHECKFP-NEXT:    vmov.f32 s1, s2
-; CHECKFP-NEXT:    vmovx.f16 s12, s7
 ; CHECKFP-NEXT:    vmovx.f16 s11, s6
+; CHECKFP-NEXT:    vmovx.f16 s12, s7
+; CHECKFP-NEXT:    vins.f16 s2, s3
 ; CHECKFP-NEXT:    vins.f16 s6, s7
-; CHECKFP-NEXT:    vmov.f32 s2, s4
+; CHECKFP-NEXT:    vins.f16 s4, s5
+; CHECKFP-NEXT:    vins.f16 s0, s1
+; CHECKFP-NEXT:    vmov.f32 s1, s2
 ; CHECKFP-NEXT:    vins.f16 s11, s12
+; CHECKFP-NEXT:    vmov.f32 s2, s4
 ; CHECKFP-NEXT:    vmov.f32 s3, s6
 ; CHECKFP-NEXT:    vadd.f16 q0, q0, q2
 ; CHECKFP-NEXT:    bx lr
@@ -1371,45 +1369,43 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
 ; CHECKFP-LABEL: shuffle3step_f16:
 ; CHECKFP:       @ %bb.0: @ %entry
-; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECKFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECKFP-NEXT:    vmovx.f16 s16, s2
+; CHECKFP-NEXT:    .vsave {d8, d9, d10}
+; CHECKFP-NEXT:    vpush {d8, d9, d10}
 ; CHECKFP-NEXT:    vmov.f32 s12, s1
-; CHECKFP-NEXT:    vins.f16 s12, s16
-; CHECKFP-NEXT:    vmovx.f16 s16, s5
+; CHECKFP-NEXT:    vmovx.f16 s14, s2
+; CHECKFP-NEXT:    vins.f16 s12, s14
 ; CHECKFP-NEXT:    vmov.f32 s13, s4
-; CHECKFP-NEXT:    vmovx.f16 s20, s11
-; CHECKFP-NEXT:    vins.f16 s13, s16
-; CHECKFP-NEXT:    vmov.f32 s19, s10
-; CHECKFP-NEXT:    vins.f16 s19, s20
+; CHECKFP-NEXT:    vmovx.f16 s14, s5
+; CHECKFP-NEXT:    vmov.f32 s15, s10
+; CHECKFP-NEXT:    vins.f16 s13, s14
+; CHECKFP-NEXT:    vmovx.f16 s14, s11
+; CHECKFP-NEXT:    vins.f16 s15, s14
 ; CHECKFP-NEXT:    vmov.f32 s14, s7
-; CHECKFP-NEXT:    vmovx.f16 s20, s8
-; CHECKFP-NEXT:    vmov.f32 s28, s6
-; CHECKFP-NEXT:    vins.f16 s14, s20
-; CHECKFP-NEXT:    vmovx.f16 s20, s7
-; CHECKFP-NEXT:    vins.f16 s28, s20
-; CHECKFP-NEXT:    vmovx.f16 s24, s1
-; CHECKFP-NEXT:    vmovx.f16 s20, s0
-; CHECKFP-NEXT:    vins.f16 s0, s24
-; CHECKFP-NEXT:    vins.f16 s20, s2
-; CHECKFP-NEXT:    vmovx.f16 s26, s4
-; CHECKFP-NEXT:    vmovx.f16 s21, s3
-; CHECKFP-NEXT:    vins.f16 s3, s26
-; CHECKFP-NEXT:    vins.f16 s21, s5
-; CHECKFP-NEXT:    vmovx.f16 s30, s10
-; CHECKFP-NEXT:    vmovx.f16 s23, s9
-; CHECKFP-NEXT:    vmov.f32 s18, s8
+; CHECKFP-NEXT:    vmovx.f16 s16, s8
+; CHECKFP-NEXT:    vmovx.f16 s4, s4
+; CHECKFP-NEXT:    vmovx.f16 s7, s7
+; CHECKFP-NEXT:    vmov.f32 s20, s6
+; CHECKFP-NEXT:    vmovx.f16 s10, s10
+; CHECKFP-NEXT:    vmovx.f16 s17, s3
+; CHECKFP-NEXT:    vmovx.f16 s19, s9
+; CHECKFP-NEXT:    vmovx.f16 s18, s6
+; CHECKFP-NEXT:    vins.f16 s14, s16
+; CHECKFP-NEXT:    vmovx.f16 s16, s0
+; CHECKFP-NEXT:    vmovx.f16 s1, s1
+; CHECKFP-NEXT:    vins.f16 s20, s7
+; CHECKFP-NEXT:    vins.f16 s3, s4
+; CHECKFP-NEXT:    vins.f16 s9, s10
+; CHECKFP-NEXT:    vins.f16 s0, s1
+; CHECKFP-NEXT:    vins.f16 s16, s2
 ; CHECKFP-NEXT:    vmov.f32 s1, s3
-; CHECKFP-NEXT:    vins.f16 s9, s30
-; CHECKFP-NEXT:    vins.f16 s23, s11
-; CHECKFP-NEXT:    vmov.f32 s2, s28
-; CHECKFP-NEXT:    vmovx.f16 s22, s6
+; CHECKFP-NEXT:    vins.f16 s17, s5
+; CHECKFP-NEXT:    vins.f16 s19, s11
+; CHECKFP-NEXT:    vins.f16 s18, s8
+; CHECKFP-NEXT:    vmov.f32 s2, s20
 ; CHECKFP-NEXT:    vmov.f32 s3, s9
-; CHECKFP-NEXT:    vins.f16 s22, s8
-; CHECKFP-NEXT:    vmov.f32 s15, s19
-; CHECKFP-NEXT:    vadd.f16 q0, q0, q5
+; CHECKFP-NEXT:    vadd.f16 q0, q0, q4
 ; CHECKFP-NEXT:    vadd.f16 q0, q0, q3
-; CHECKFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECKFP-NEXT:    vpop {d8, d9, d10}
 ; CHECKFP-NEXT:    bx lr
 entry:
   %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -1425,47 +1421,47 @@ define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) {
 ; CHECKFP:       @ %bb.0: @ %entry
 ; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECKFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECKFP-NEXT:    vmovx.f16 s20, s11
 ; CHECKFP-NEXT:    vmovx.f16 s18, s9
-; CHECKFP-NEXT:    vins.f16 s18, s20
-; CHECKFP-NEXT:    vmovx.f16 s20, s15
+; CHECKFP-NEXT:    vmovx.f16 s16, s11
+; CHECKFP-NEXT:    vins.f16 s18, s16
 ; CHECKFP-NEXT:    vmovx.f16 s19, s13
-; CHECKFP-NEXT:    vins.f16 s9, s11
-; CHECKFP-NEXT:    vins.f16 s19, s20
-; CHECKFP-NEXT:    vmovx.f16 s20, s3
+; CHECKFP-NEXT:    vmovx.f16 s16, s15
+; CHECKFP-NEXT:    vmovx.f16 s22, s8
+; CHECKFP-NEXT:    vins.f16 s19, s16
 ; CHECKFP-NEXT:    vmovx.f16 s16, s1
-; CHECKFP-NEXT:    vmovx.f16 s24, s10
+; CHECKFP-NEXT:    vmovx.f16 s20, s3
+; CHECKFP-NEXT:    vins.f16 s1, s3
+; CHECKFP-NEXT:    vmovx.f16 s3, s10
 ; CHECKFP-NEXT:    vins.f16 s16, s20
-; CHECKFP-NEXT:    vmovx.f16 s20, s7
 ; CHECKFP-NEXT:    vmovx.f16 s17, s5
-; CHECKFP-NEXT:    vins.f16 s13, s15
-; CHECKFP-NEXT:    vins.f16 s17, s20
-; CHECKFP-NEXT:    vmovx.f16 s22, s8
-; CHECKFP-NEXT:    vins.f16 s22, s24
-; CHECKFP-NEXT:    vmovx.f16 s24, s14
+; CHECKFP-NEXT:    vmovx.f16 s20, s7
+; CHECKFP-NEXT:    vins.f16 s22, s3
 ; CHECKFP-NEXT:    vmovx.f16 s23, s12
-; CHECKFP-NEXT:    vins.f16 s1, s3
-; CHECKFP-NEXT:    vins.f16 s23, s24
-; CHECKFP-NEXT:    vmovx.f16 s24, s2
+; CHECKFP-NEXT:    vmovx.f16 s3, s14
+; CHECKFP-NEXT:    vins.f16 s17, s20
+; CHECKFP-NEXT:    vins.f16 s23, s3
 ; CHECKFP-NEXT:    vmovx.f16 s20, s0
+; CHECKFP-NEXT:    vmovx.f16 s3, s2
+; CHECKFP-NEXT:    vins.f16 s9, s11
+; CHECKFP-NEXT:    vins.f16 s13, s15
 ; CHECKFP-NEXT:    vins.f16 s5, s7
-; CHECKFP-NEXT:    vins.f16 s20, s24
-; CHECKFP-NEXT:    vmovx.f16 s24, s6
+; CHECKFP-NEXT:    vins.f16 s20, s3
 ; CHECKFP-NEXT:    vmovx.f16 s21, s4
+; CHECKFP-NEXT:    vmovx.f16 s3, s6
 ; CHECKFP-NEXT:    vins.f16 s8, s10
-; CHECKFP-NEXT:    vins.f16 s21, s24
-; CHECKFP-NEXT:    vmov.f32 s26, s9
 ; CHECKFP-NEXT:    vins.f16 s12, s14
-; CHECKFP-NEXT:    vins.f16 s0, s2
-; CHECKFP-NEXT:    vmov.f32 s27, s13
 ; CHECKFP-NEXT:    vins.f16 s4, s6
+; CHECKFP-NEXT:    vins.f16 s21, s3
+; CHECKFP-NEXT:    vins.f16 s0, s2
 ; CHECKFP-NEXT:    vmov.f32 s24, s1
+; CHECKFP-NEXT:    vmov.f32 s26, s9
+; CHECKFP-NEXT:    vmov.f32 s27, s13
+; CHECKFP-NEXT:    vmov.f32 s25, s5
 ; CHECKFP-NEXT:    vmov.f32 s2, s8
+; CHECKFP-NEXT:    vadd.f16 q4, q6, q4
 ; CHECKFP-NEXT:    vmov.f32 s3, s12
 ; CHECKFP-NEXT:    vmov.f32 s1, s4
-; CHECKFP-NEXT:    vmov.f32 s25, s5
 ; CHECKFP-NEXT:    vadd.f16 q0, q0, q5
-; CHECKFP-NEXT:    vadd.f16 q4, q6, q4
 ; CHECKFP-NEXT:    vadd.f16 q0, q0, q4
 ; CHECKFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECKFP-NEXT:    bx lr
@@ -1495,8 +1491,8 @@ define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) {
 ; CHECK-LABEL: shuffle2_f64:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov.f32 s7, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
@@ -1559,7 +1555,6 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) {
 ; CHECK-LABEL: insert_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = insertelement <4 x float> undef, float %a, i32 0
@@ -1569,7 +1564,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @insert_f16(half %a) {
 ; CHECK-LABEL: insert_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = insertelement <8 x half> undef, half %a, i32 0
@@ -1579,7 +1573,6 @@ entry:
 define arm_aapcs_vfpcc <2 x double> @insert_f64(double %a) {
 ; CHECK-LABEL: insert_f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = insertelement <2 x double> undef, double %a, i32 0
@@ -1696,7 +1689,6 @@ entry:
 define arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) {
 ; CHECK-LABEL: extract_f32_0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = extractelement <4 x float> %a, i32 0
@@ -1716,7 +1708,6 @@ entry:
 define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) {
 ; CHECK-LABEL: extract_f16_0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = extractelement <8 x half> %a, i32 0
@@ -1736,7 +1727,6 @@ entry:
 define arm_aapcs_vfpcc double @extract_f64_0(<2 x double> %a) {
 ; CHECK-LABEL: extract_f64_0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    bx lr
 entry:
   %res = extractelement <2 x double> %a, i32 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
index 043f7d9576a3b..b487407eefa5a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
@@ -52,10 +52,10 @@ define arm_aapcs_vfpcc <4 x i32> @sext_i32_1357_swapped(<8 x i16> %src) {
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    vldrh.s32 q2, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s1, s3
 ; CHECK-NEXT:    vmov.f32 s2, s5
 ; CHECK-NEXT:    vmov.f32 s3, s7
 ; CHECK-NEXT:    add sp, #16
@@ -94,9 +94,9 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_02468101214_swapped(<16 x i16> %src)
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
@@ -126,17 +126,17 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_13579111315_swapped(<16 x i16> %src)
 ; CHECK-NEXT:    add r1, sp, #16
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    vldrh.s32 q2, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT:    vldrh.s32 q3, [r1]
-; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vmov.f32 s1, s11
 ; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s1, s3
 ; CHECK-NEXT:    vmov.f32 s2, s5
 ; CHECK-NEXT:    vmov.f32 s3, s7
-; CHECK-NEXT:    vmov.f32 s4, s13
-; CHECK-NEXT:    vmov.f32 s5, s15
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
 ; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
@@ -195,10 +195,10 @@ define arm_aapcs_vfpcc <4 x i32> @zext_i32_1357_swapped(<8 x i16> %src) {
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    vldrh.u32 q2, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s1, s3
 ; CHECK-NEXT:    vmov.f32 s2, s5
 ; CHECK-NEXT:    vmov.f32 s3, s7
 ; CHECK-NEXT:    add sp, #16
@@ -237,9 +237,9 @@ define arm_aapcs_vfpcc <8 x i32> @zext_i32_02468101214_swapped(<16 x i16> %src)
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vldrh.u32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
@@ -269,17 +269,17 @@ define arm_aapcs_vfpcc <8 x i32> @zext_i32_13579111315_swapped(<16 x i16> %src)
 ; CHECK-NEXT:    add r1, sp, #16
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    vldrh.u32 q2, [r0]
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT:    vldrh.u32 q3, [r1]
-; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vmov.f32 s1, s11
 ; CHECK-NEXT:    vldrh.u32 q2, [r1, #8]
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s1, s3
 ; CHECK-NEXT:    vmov.f32 s2, s5
 ; CHECK-NEXT:    vmov.f32 s3, s7
-; CHECK-NEXT:    vmov.f32 s4, s13
-; CHECK-NEXT:    vmov.f32 s5, s15
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
 ; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
index ce08e69b6816c..7318ec8077deb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -37,12 +37,12 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16>
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vmovx.f16 s0, s7
-; CHECK-NEXT:    vins.f16 s0, s7
 ; CHECK-NEXT:    vmovx.f16 s1, s6
-; CHECK-NEXT:    vins.f16 s1, s6
 ; CHECK-NEXT:    vmovx.f16 s2, s5
-; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
@@ -340,12 +340,12 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x hal
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vmovx.f16 s0, s7
-; CHECK-NEXT:    vins.f16 s0, s7
 ; CHECK-NEXT:    vmovx.f16 s1, s6
-; CHECK-NEXT:    vins.f16 s1, s6
 ; CHECK-NEXT:    vmovx.f16 s2, s5
-; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s2, s5
 ; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index a71adb8a655d1..d145b6a61737b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -56,11 +56,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @add_float32_t(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-MVE-LABEL: add_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vadd.f32 s11, s7, s3
-; CHECK-MVE-NEXT:    vadd.f32 s10, s6, s2
-; CHECK-MVE-NEXT:    vadd.f32 s9, s5, s1
-; CHECK-MVE-NEXT:    vadd.f32 s8, s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vadd.f32 s3, s7, s3
+; CHECK-MVE-NEXT:    vadd.f32 s2, s6, s2
+; CHECK-MVE-NEXT:    vadd.f32 s1, s5, s1
+; CHECK-MVE-NEXT:    vadd.f32 s0, s4, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: add_float32_t:
@@ -75,27 +74,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: add_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q2, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vadd.f16 s12, s2, s0
-; CHECK-MVE-NEXT:    vadd.f16 s0, s4, s8
-; CHECK-MVE-NEXT:    vins.f16 s0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vadd.f16 s1, s5, s9
-; CHECK-MVE-NEXT:    vins.f16 s1, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vadd.f16 s2, s6, s10
-; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vins.f16 s2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
-; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vadd.f16 s3, s7, s11
-; CHECK-MVE-NEXT:    vins.f16 s3, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vadd.f16 s0, s4, s0
+; CHECK-MVE-NEXT:    vadd.f16 s8, s10, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vadd.f16 s1, s5, s1
+; CHECK-MVE-NEXT:    vadd.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vadd.f16 s2, s6, s2
+; CHECK-MVE-NEXT:    vadd.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
+; CHECK-MVE-NEXT:    vadd.f16 s3, s7, s3
+; CHECK-MVE-NEXT:    vadd.f16 s4, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: add_float16_t:
@@ -189,11 +187,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @sub_float32_t(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-MVE-LABEL: sub_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vsub.f32 s11, s7, s3
-; CHECK-MVE-NEXT:    vsub.f32 s10, s6, s2
-; CHECK-MVE-NEXT:    vsub.f32 s9, s5, s1
-; CHECK-MVE-NEXT:    vsub.f32 s8, s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vsub.f32 s3, s7, s3
+; CHECK-MVE-NEXT:    vsub.f32 s2, s6, s2
+; CHECK-MVE-NEXT:    vsub.f32 s1, s5, s1
+; CHECK-MVE-NEXT:    vsub.f32 s0, s4, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: sub_float32_t:
@@ -208,27 +205,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: sub_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q2, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vsub.f16 s12, s2, s0
-; CHECK-MVE-NEXT:    vsub.f16 s0, s4, s8
-; CHECK-MVE-NEXT:    vins.f16 s0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vsub.f16 s1, s5, s9
-; CHECK-MVE-NEXT:    vins.f16 s1, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vsub.f16 s2, s6, s10
-; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vins.f16 s2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
-; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vsub.f16 s3, s7, s11
-; CHECK-MVE-NEXT:    vins.f16 s3, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vsub.f16 s0, s4, s0
+; CHECK-MVE-NEXT:    vsub.f16 s8, s10, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vsub.f16 s1, s5, s1
+; CHECK-MVE-NEXT:    vsub.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vsub.f16 s2, s6, s2
+; CHECK-MVE-NEXT:    vsub.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
+; CHECK-MVE-NEXT:    vsub.f16 s3, s7, s3
+; CHECK-MVE-NEXT:    vsub.f16 s4, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: sub_float16_t:
@@ -324,27 +320,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: mul_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q2, q0
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmul.f16 s12, s2, s0
-; CHECK-MVE-NEXT:    vmul.f16 s0, s4, s8
-; CHECK-MVE-NEXT:    vins.f16 s0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmul.f16 s1, s5, s9
-; CHECK-MVE-NEXT:    vins.f16 s1, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vmul.f16 s2, s6, s10
-; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vins.f16 s2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
-; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmul.f16 s3, s7, s11
-; CHECK-MVE-NEXT:    vins.f16 s3, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-MVE-NEXT:    vmul.f16 s8, s10, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vmul.f16 s1, s5, s1
+; CHECK-MVE-NEXT:    vmul.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vmul.f16 s2, s6, s2
+; CHECK-MVE-NEXT:    vmul.f16 s4, s8, s4
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
+; CHECK-MVE-NEXT:    vmul.f16 s3, s7, s3
+; CHECK-MVE-NEXT:    vmul.f16 s4, s6, s4
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: mul_float16_t:
@@ -359,11 +354,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @mul_float32_t(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-MVE-LABEL: mul_float32_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmul.f32 s11, s7, s3
-; CHECK-MVE-NEXT:    vmul.f32 s10, s6, s2
-; CHECK-MVE-NEXT:    vmul.f32 s9, s5, s1
-; CHECK-MVE-NEXT:    vmul.f32 s8, s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vmul.f32 s3, s7, s3
+; CHECK-MVE-NEXT:    vmul.f32 s2, s6, s2
+; CHECK-MVE-NEXT:    vmul.f32 s1, s5, s1
+; CHECK-MVE-NEXT:    vmul.f32 s0, s4, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: mul_float32_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
index 488a20bc9602f..4b76906034057 100644
--- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
@@ -6,10 +6,10 @@
 define <16 x i8> @vector_add_i8(<16 x i8> %lhs, <16 x i8> %rhs) {
 ; CHECK-LE-LABEL: vector_add_i8:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vmov d0, r0, r1
 ; CHECK-LE-NEXT:    mov r0, sp
 ; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vadd.i8 q0, q0, q1
 ; CHECK-LE-NEXT:    vmov r0, r1, d0
 ; CHECK-LE-NEXT:    vmov r2, r3, d1
@@ -17,9 +17,9 @@ define <16 x i8> @vector_add_i8(<16 x i8> %lhs, <16 x i8> %rhs) {
 ;
 ; CHECK-BE-LABEL: vector_add_i8:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
 ; CHECK-BE-NEXT:    mov r0, sp
+; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-BE-NEXT:    vadd.i8 q0, q1, q0
@@ -35,10 +35,10 @@ entry:
 define <8 x i16> @vector_add_i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; CHECK-LE-LABEL: vector_add_i16:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vmov d0, r0, r1
 ; CHECK-LE-NEXT:    mov r0, sp
 ; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-LE-NEXT:    vmov r0, r1, d0
 ; CHECK-LE-NEXT:    vmov r2, r3, d1
@@ -46,9 +46,9 @@ define <8 x i16> @vector_add_i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ;
 ; CHECK-BE-LABEL: vector_add_i16:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
 ; CHECK-BE-NEXT:    mov r0, sp
+; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-BE-NEXT:    vadd.i16 q0, q1, q0
@@ -64,10 +64,10 @@ entry:
 define <4 x i32> @vector_add_i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; CHECK-LE-LABEL: vector_add_i32:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vmov d0, r0, r1
 ; CHECK-LE-NEXT:    mov r0, sp
 ; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-LE-NEXT:    vmov r0, r1, d0
 ; CHECK-LE-NEXT:    vmov r2, r3, d1
@@ -75,9 +75,9 @@ define <4 x i32> @vector_add_i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ;
 ; CHECK-BE-LABEL: vector_add_i32:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
 ; CHECK-BE-NEXT:    mov r0, sp
+; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
@@ -144,10 +144,10 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) {
 ; CHECK-MVE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-MVE-NEXT:    vmov d9, r2, r3
 ; CHECK-MVE-NEXT:    vmov d8, r0, r1
 ; CHECK-MVE-NEXT:    add r0, sp, #64
 ; CHECK-MVE-NEXT:    vldrw.u32 q6, [r0]
+; CHECK-MVE-NEXT:    vmov d9, r2, r3
 ; CHECK-MVE-NEXT:    vmov.u16 r4, q4[0]
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q6[0]
 ; CHECK-MVE-NEXT:    bl __aeabi_h2f
@@ -239,13 +239,13 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) {
 ; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-BE-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
 ; CHECK-BE-NEXT:    add r0, sp, #64
 ; CHECK-BE-NEXT:    vldrh.u16 q6, [r0]
+; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vrev64.16 q4, q0
-; CHECK-BE-NEXT:    vmov.u16 r4, q4[0]
 ; CHECK-BE-NEXT:    vmov.u16 r0, q6[0]
+; CHECK-BE-NEXT:    vmov.u16 r4, q4[0]
 ; CHECK-BE-NEXT:    bl __aeabi_h2f
 ; CHECK-BE-NEXT:    mov r5, r0
 ; CHECK-BE-NEXT:    mov r0, r4
@@ -332,10 +332,10 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) {
 ;
 ; CHECK-FP-LABEL: vector_add_f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmov d1, r2, r3
 ; CHECK-FP-NEXT:    vmov d0, r0, r1
 ; CHECK-FP-NEXT:    mov r0, sp
 ; CHECK-FP-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-FP-NEXT:    vmov d1, r2, r3
 ; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vmov r0, r1, d0
 ; CHECK-FP-NEXT:    vmov r2, r3, d1
@@ -352,21 +352,21 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK-MVE-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-MVE-NEXT:    .pad #4
 ; CHECK-MVE-NEXT:    sub sp, #4
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    mov r4, r0
-; CHECK-MVE-NEXT:    add r0, sp, #56
-; CHECK-MVE-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-MVE-NEXT:    add r0, sp, #40
+; CHECK-MVE-NEXT:    vldrw.u32 q4, [r0]
 ; CHECK-MVE-NEXT:    mov r6, r1
 ; CHECK-MVE-NEXT:    mov r0, r3
 ; CHECK-MVE-NEXT:    mov r5, r2
-; CHECK-MVE-NEXT:    vmov r7, r1, d11
+; CHECK-MVE-NEXT:    vmov r7, r1, d9
 ; CHECK-MVE-NEXT:    bl __aeabi_fadd
 ; CHECK-MVE-NEXT:    vmov s19, r0
 ; CHECK-MVE-NEXT:    mov r0, r5
 ; CHECK-MVE-NEXT:    mov r1, r7
 ; CHECK-MVE-NEXT:    bl __aeabi_fadd
-; CHECK-MVE-NEXT:    vmov r5, r1, d10
+; CHECK-MVE-NEXT:    vmov r5, r1, d8
 ; CHECK-MVE-NEXT:    vmov s18, r0
 ; CHECK-MVE-NEXT:    mov r0, r6
 ; CHECK-MVE-NEXT:    bl __aeabi_fadd
@@ -377,7 +377,7 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK-MVE-NEXT:    vmov s16, r0
 ; CHECK-MVE-NEXT:    vmov r2, r3, d9
 ; CHECK-MVE-NEXT:    vmov r0, r1, d8
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    add sp, #4
 ; CHECK-MVE-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
@@ -385,23 +385,23 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-BE-NEXT:    push {r4, r5, r7, lr}
-; CHECK-BE-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-BE-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-BE-NEXT:    vmov d1, r3, r2
+; CHECK-BE-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-BE-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
-; CHECK-BE-NEXT:    add r1, sp, #64
-; CHECK-BE-NEXT:    vldrw.u32 q6, [r1]
-; CHECK-BE-NEXT:    vrev64.32 q5, q0
-; CHECK-BE-NEXT:    vmov r4, r0, d11
-; CHECK-BE-NEXT:    vmov r5, r1, d13
+; CHECK-BE-NEXT:    add r1, sp, #48
+; CHECK-BE-NEXT:    vldrw.u32 q5, [r1]
+; CHECK-BE-NEXT:    vmov d1, r3, r2
+; CHECK-BE-NEXT:    vrev64.32 q4, q0
+; CHECK-BE-NEXT:    vmov r4, r0, d9
+; CHECK-BE-NEXT:    vmov r5, r1, d11
 ; CHECK-BE-NEXT:    bl __aeabi_fadd
 ; CHECK-BE-NEXT:    vmov s19, r0
 ; CHECK-BE-NEXT:    mov r0, r4
 ; CHECK-BE-NEXT:    mov r1, r5
 ; CHECK-BE-NEXT:    bl __aeabi_fadd
 ; CHECK-BE-NEXT:    vmov s18, r0
-; CHECK-BE-NEXT:    vmov r4, r0, d10
-; CHECK-BE-NEXT:    vmov r5, r1, d12
+; CHECK-BE-NEXT:    vmov r4, r0, d8
+; CHECK-BE-NEXT:    vmov r5, r1, d10
 ; CHECK-BE-NEXT:    bl __aeabi_fadd
 ; CHECK-BE-NEXT:    vmov s17, r0
 ; CHECK-BE-NEXT:    mov r0, r4
@@ -411,15 +411,15 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK-BE-NEXT:    vrev64.32 q0, q4
 ; CHECK-BE-NEXT:    vmov r1, r0, d0
 ; CHECK-BE-NEXT:    vmov r3, r2, d1
-; CHECK-BE-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-BE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-BE-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; CHECK-FP-LABEL: vector_add_f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmov d1, r2, r3
 ; CHECK-FP-NEXT:    vmov d0, r0, r1
 ; CHECK-FP-NEXT:    mov r0, sp
 ; CHECK-FP-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-FP-NEXT:    vmov d1, r2, r3
 ; CHECK-FP-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-FP-NEXT:    vmov r0, r1, d0
 ; CHECK-FP-NEXT:    vmov r2, r3, d1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index df14b59f9934d..56f95b2218378 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -186,8 +186,8 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .vsave {d9}
+; CHECK-NEXT:    vpush {d9}
 ; CHECK-NEXT:    mov.w lr, #256
 ; CHECK-NEXT:    mov.w r12, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
@@ -195,12 +195,13 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
-; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vmov.f32 s16, s8
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r5, s8
 ; CHECK-NEXT:    vmov.f32 s14, s5
-; CHECK-NEXT:    vmov r3, s12
 ; CHECK-NEXT:    vmov.f32 s18, s9
-; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov.f32 s8, s10
 ; CHECK-NEXT:    vmov r7, s18
 ; CHECK-NEXT:    asrs r4, r3, #31
 ; CHECK-NEXT:    subs.w r8, r3, r5
@@ -209,24 +210,21 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    bfi r4, r5, #0, #4
 ; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov.f32 s12, s6
-; CHECK-NEXT:    vmov.f32 s14, s7
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vmov.f32 s6, s11
-; CHECK-NEXT:    vmov r3, s6
 ; CHECK-NEXT:    subs.w r9, r5, r7
 ; CHECK-NEXT:    asr.w r6, r5, #31
-; CHECK-NEXT:    vmov r5, s12
+; CHECK-NEXT:    vmov r5, s4
 ; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
 ; CHECK-NEXT:    and.w r6, r12, r6, asr #31
 ; CHECK-NEXT:    rsbs r6, r6, #0
 ; CHECK-NEXT:    bfi r4, r6, #4, #4
-; CHECK-NEXT:    vmov r6, s14
+; CHECK-NEXT:    vmov r6, s6
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    asrs r7, r6, #31
 ; CHECK-NEXT:    subs.w r10, r6, r3
-; CHECK-NEXT:    asr.w r7, r6, #31
+; CHECK-NEXT:    asr.w r6, r5, #31
 ; CHECK-NEXT:    sbc.w r3, r7, r3, asr #31
-; CHECK-NEXT:    vmov r7, s4
-; CHECK-NEXT:    asrs r6, r5, #31
+; CHECK-NEXT:    vmov r7, s8
 ; CHECK-NEXT:    asr.w r11, r3, #31
 ; CHECK-NEXT:    and.w r3, r12, r3, asr #31
 ; CHECK-NEXT:    rsbs r3, r3, #0
@@ -247,7 +245,7 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d9}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
@@ -390,7 +388,13 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    vmov.f32 s20, s12
 ; CHECK-NEXT:    vmov.f32 s22, s13
 ; CHECK-NEXT:    vand q5, q5, q0
+; CHECK-NEXT:    vmov.f32 s8, s10
 ; CHECK-NEXT:    vmov r5, r6, d10
+; CHECK-NEXT:    vmov.f32 s10, s11
+; CHECK-NEXT:    vmov.f32 s12, s14
+; CHECK-NEXT:    vand q2, q2, q0
+; CHECK-NEXT:    vmov.f32 s14, s15
+; CHECK-NEXT:    vand q3, q3, q0
 ; CHECK-NEXT:    subs.w r8, r5, r3
 ; CHECK-NEXT:    vmov r7, r3, d11
 ; CHECK-NEXT:    sbc.w r4, r6, r4
@@ -398,12 +402,6 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    bfi r4, r5, #0, #4
 ; CHECK-NEXT:    vmov r5, r6, d9
-; CHECK-NEXT:    vmov.f32 s16, s10
-; CHECK-NEXT:    vmov.f32 s18, s11
-; CHECK-NEXT:    vand q2, q4, q0
-; CHECK-NEXT:    vmov.f32 s16, s14
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vand q3, q4, q0
 ; CHECK-NEXT:    subs.w r9, r7, r5
 ; CHECK-NEXT:    mov.w r7, #1
 ; CHECK-NEXT:    sbcs r3, r6

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
index a94079d659216..43a3d1b049acf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
@@ -759,8 +759,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oeq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -769,15 +769,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -785,76 +785,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16:
@@ -871,8 +870,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_one_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -884,16 +883,16 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -901,11 +900,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -913,11 +911,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -925,14 +923,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -940,11 +939,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -952,14 +951,14 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -967,10 +966,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
@@ -979,10 +978,9 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_one_v8f16:
@@ -1000,8 +998,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ogt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1010,15 +1008,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1026,76 +1024,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16:
@@ -1112,8 +1109,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1122,15 +1119,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1138,76 +1135,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16:
@@ -1224,8 +1220,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_olt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1234,15 +1230,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1250,76 +1246,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16:
@@ -1336,8 +1331,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ole_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1346,15 +1341,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1362,76 +1357,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16:
@@ -1448,8 +1442,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ueq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1461,16 +1455,16 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1478,11 +1472,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1490,11 +1483,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1502,14 +1495,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1517,11 +1511,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1529,14 +1523,14 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1544,10 +1538,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
@@ -1556,10 +1550,9 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16:
@@ -1577,8 +1570,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1587,15 +1580,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1603,76 +1596,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_une_v8f16:
@@ -1689,8 +1681,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ugt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1699,15 +1691,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1715,76 +1707,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16:
@@ -1801,8 +1792,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_uge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1811,15 +1802,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1827,76 +1818,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16:
@@ -1913,8 +1903,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ult_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -1923,15 +1913,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -1939,76 +1929,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16:
@@ -2025,8 +2014,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ule_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -2035,15 +2024,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -2051,76 +2040,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16:
@@ -2137,8 +2125,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ord_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -2147,15 +2135,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -2163,76 +2151,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16:
@@ -2250,8 +2237,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_uno_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
@@ -2260,15 +2247,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
@@ -2276,76 +2263,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vins.f16 s16, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
-; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s20
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
index e47207594a7ac..d90688d43d6e0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
@@ -801,8 +801,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oeq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -822,12 +820,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -839,17 +837,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -861,17 +859,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -882,17 +880,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16:
@@ -912,8 +908,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_one_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -937,12 +931,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -956,19 +950,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -982,19 +976,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1007,7 +1001,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
@@ -1016,10 +1010,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_one_v8f16:
@@ -1040,8 +1032,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ogt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1061,12 +1051,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1078,17 +1068,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1100,17 +1090,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1121,17 +1111,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16:
@@ -1151,8 +1139,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1172,12 +1158,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1189,17 +1175,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1211,17 +1197,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1232,17 +1218,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16:
@@ -1262,8 +1246,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_olt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1283,12 +1265,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1300,17 +1282,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1322,17 +1304,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1343,17 +1325,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16:
@@ -1373,8 +1353,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ole_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1394,12 +1372,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1411,17 +1389,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1433,17 +1411,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1454,17 +1432,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16:
@@ -1484,8 +1460,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ueq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1509,12 +1483,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1528,19 +1502,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1554,19 +1528,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1579,7 +1553,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
@@ -1588,10 +1562,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16:
@@ -1612,8 +1584,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1633,12 +1603,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1650,17 +1620,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1672,17 +1642,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1693,17 +1663,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_une_v8f16:
@@ -1723,8 +1691,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ugt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1744,12 +1710,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1761,17 +1727,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1783,17 +1749,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1804,17 +1770,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16:
@@ -1834,8 +1798,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_uge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1855,12 +1817,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1872,17 +1834,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -1894,17 +1856,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -1915,17 +1877,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16:
@@ -1945,8 +1905,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ult_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -1966,12 +1924,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -1983,17 +1941,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -2005,17 +1963,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -2026,17 +1984,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16:
@@ -2056,8 +2012,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ule_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -2077,12 +2031,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -2094,17 +2048,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -2116,17 +2070,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -2137,17 +2091,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16:
@@ -2167,8 +2119,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ord_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -2188,12 +2138,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -2205,17 +2155,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -2227,17 +2177,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -2248,17 +2198,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16:
@@ -2279,8 +2227,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_uno_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -2300,12 +2246,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -2317,17 +2263,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -2339,17 +2285,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -2360,17 +2306,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16:
@@ -3190,8 +3134,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -3211,12 +3153,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -3228,17 +3170,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -3250,17 +3192,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -3271,17 +3213,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16:
@@ -3301,8 +3241,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_one_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -3326,12 +3264,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -3345,19 +3283,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -3371,19 +3309,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -3396,7 +3334,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
@@ -3405,10 +3343,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16:
@@ -3429,8 +3365,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -3450,12 +3384,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -3467,17 +3401,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -3489,17 +3423,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -3510,17 +3444,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16:
@@ -3540,8 +3472,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_oge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -3561,12 +3491,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -3578,17 +3508,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -3600,17 +3530,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -3621,17 +3551,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16:
@@ -3651,8 +3579,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_olt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -3672,12 +3598,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -3689,17 +3615,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -3711,17 +3637,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -3732,17 +3658,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16:
@@ -3762,8 +3686,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ole_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -3783,12 +3705,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -3800,17 +3722,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -3822,17 +3744,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -3843,17 +3765,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16:
@@ -3873,8 +3793,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -3898,12 +3816,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -3917,19 +3835,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -3943,19 +3861,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -3968,7 +3886,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
@@ -3977,10 +3895,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16:
@@ -4001,8 +3917,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -4022,12 +3936,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4039,17 +3953,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4061,17 +3975,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4082,17 +3996,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16:
@@ -4112,8 +4024,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ugt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -4133,12 +4043,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4150,17 +4060,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4172,17 +4082,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4193,17 +4103,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16:
@@ -4223,8 +4131,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_uge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -4244,12 +4150,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4261,17 +4167,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4283,17 +4189,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4304,17 +4210,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16:
@@ -4334,8 +4238,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ult_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -4355,12 +4257,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4372,17 +4274,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4394,17 +4296,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4415,17 +4317,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16:
@@ -4445,8 +4345,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ule_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -4466,12 +4364,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4483,17 +4381,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4505,17 +4403,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4526,17 +4424,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16:
@@ -4556,8 +4452,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ord_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -4577,12 +4471,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4594,17 +4488,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4616,17 +4510,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4637,17 +4531,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16:
@@ -4668,8 +4560,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_uno_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
@@ -4689,12 +4579,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4706,17 +4596,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4728,17 +4618,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4749,17 +4639,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16:
@@ -4782,8 +4670,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oeq_v8f16_bc:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
@@ -4803,12 +4689,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
+; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
@@ -4820,17 +4706,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
-; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
@@ -4842,17 +4728,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
-; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
@@ -4863,17 +4749,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
-; CHECK-MVE-NEXT:    vins.f16 s19, s6
-; CHECK-MVE-NEXT:    vmov q0, q4
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s6
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16_bc:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
index eee5c5b249e1a..33231783c5e69 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
@@ -759,8 +759,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oeq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -774,43 +772,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -819,20 +817,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -840,17 +838,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16:
@@ -867,8 +863,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_one_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -884,7 +878,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -892,15 +886,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -908,25 +900,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -937,22 +931,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -962,7 +956,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
@@ -971,10 +965,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_one_v8f16:
@@ -992,8 +984,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ogt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1007,43 +997,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
@@ -1052,20 +1042,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
@@ -1073,17 +1063,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16:
@@ -1100,8 +1088,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1115,43 +1101,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
@@ -1160,20 +1146,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
@@ -1181,17 +1167,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16:
@@ -1208,8 +1192,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_olt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1223,43 +1205,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -1268,20 +1250,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -1289,17 +1271,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16:
@@ -1316,8 +1296,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ole_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1331,43 +1309,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
@@ -1376,20 +1354,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
@@ -1397,17 +1375,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16:
@@ -1424,8 +1400,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ueq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1441,7 +1415,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1449,15 +1423,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1465,25 +1437,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1494,22 +1468,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -1519,7 +1493,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
@@ -1528,10 +1502,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16:
@@ -1549,8 +1521,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1564,43 +1534,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
@@ -1609,20 +1579,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
@@ -1630,17 +1600,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_une_v8f16:
@@ -1657,8 +1625,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ugt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1672,43 +1638,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
@@ -1717,20 +1683,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
@@ -1738,17 +1704,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16:
@@ -1765,8 +1729,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_uge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1780,43 +1742,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
@@ -1825,20 +1787,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
@@ -1846,17 +1808,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16:
@@ -1873,8 +1833,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ult_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1888,43 +1846,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
@@ -1933,20 +1891,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
@@ -1954,17 +1912,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16:
@@ -1981,8 +1937,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ule_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -1996,43 +1950,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
@@ -2041,20 +1995,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
@@ -2062,17 +2016,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16:
@@ -2089,8 +2041,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_ord_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, s12
@@ -2104,43 +2054,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
@@ -2149,20 +2099,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
@@ -2170,17 +2120,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16:
@@ -2198,8 +2146,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_uno_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, s12
@@ -2213,43 +2159,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
@@ -2258,20 +2204,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
@@ -2279,17 +2225,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16:
@@ -3064,8 +3008,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3079,43 +3021,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -3124,20 +3066,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -3145,17 +3087,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16:
@@ -3172,8 +3112,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_one_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3189,7 +3127,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -3197,15 +3135,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -3213,25 +3149,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -3242,22 +3180,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -3267,7 +3205,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
@@ -3276,10 +3214,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16:
@@ -3297,8 +3233,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3312,43 +3246,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -3357,20 +3291,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
@@ -3378,17 +3312,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16:
@@ -3405,8 +3337,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_oge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3420,43 +3350,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
@@ -3465,20 +3395,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
@@ -3486,17 +3416,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16:
@@ -3513,8 +3441,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_olt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3528,43 +3454,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
@@ -3573,20 +3499,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
@@ -3594,17 +3520,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16:
@@ -3621,8 +3545,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ole_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3636,43 +3558,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
@@ -3681,20 +3603,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
@@ -3702,17 +3624,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16:
@@ -3729,8 +3649,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3746,7 +3664,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -3754,15 +3672,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -3770,25 +3686,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -3799,22 +3717,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
@@ -3824,7 +3742,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
@@ -3833,10 +3751,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16:
@@ -3854,8 +3770,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3869,43 +3783,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
@@ -3914,20 +3828,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
@@ -3935,17 +3849,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16:
@@ -3962,8 +3874,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ugt_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -3977,43 +3887,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
@@ -4022,20 +3932,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
@@ -4043,17 +3953,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16:
@@ -4070,8 +3978,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_uge_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -4085,43 +3991,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
@@ -4130,20 +4036,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
@@ -4151,17 +4057,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16:
@@ -4178,8 +4082,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ult_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -4193,43 +4095,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
@@ -4238,20 +4140,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
@@ -4259,17 +4161,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16:
@@ -4286,8 +4186,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ule_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
@@ -4301,43 +4199,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
@@ -4346,20 +4244,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
@@ -4367,17 +4265,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16:
@@ -4394,8 +4290,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_ord_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, s12
@@ -4409,43 +4303,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
@@ -4454,20 +4348,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
@@ -4475,17 +4369,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16:
@@ -4503,8 +4395,6 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_uno_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9}
-; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, s12
@@ -4518,43 +4408,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vins.f16 s12, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
-; CHECK-MVE-NEXT:    vins.f16 s13, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vins.f16 s1, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
@@ -4563,20 +4453,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
-; CHECK-MVE-NEXT:    vins.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
-; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
@@ -4584,17 +4474,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
-; CHECK-MVE-NEXT:    vins.f16 s15, s16
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9}
+; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index 9b1175fabce3b..84a9e0145f0c7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -5,11 +5,10 @@
 define arm_aapcs_vfpcc <4 x float> @foo_float_int32(<4 x i32> %src) {
 ; CHECK-MVE-LABEL: foo_float_int32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvt.f32.s32 s7, s3
-; CHECK-MVE-NEXT:    vcvt.f32.s32 s6, s2
-; CHECK-MVE-NEXT:    vcvt.f32.s32 s5, s1
-; CHECK-MVE-NEXT:    vcvt.f32.s32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vcvt.f32.s32 s3, s3
+; CHECK-MVE-NEXT:    vcvt.f32.s32 s2, s2
+; CHECK-MVE-NEXT:    vcvt.f32.s32 s1, s1
+; CHECK-MVE-NEXT:    vcvt.f32.s32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_float_int32:
@@ -24,11 +23,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @foo_float_uint32(<4 x i32> %src) {
 ; CHECK-MVE-LABEL: foo_float_uint32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvt.f32.u32 s7, s3
-; CHECK-MVE-NEXT:    vcvt.f32.u32 s6, s2
-; CHECK-MVE-NEXT:    vcvt.f32.u32 s5, s1
-; CHECK-MVE-NEXT:    vcvt.f32.u32 s4, s0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vcvt.f32.u32 s3, s3
+; CHECK-MVE-NEXT:    vcvt.f32.u32 s2, s2
+; CHECK-MVE-NEXT:    vcvt.f32.u32 s1, s1
+; CHECK-MVE-NEXT:    vcvt.f32.u32 s0, s0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_float_uint32:
@@ -43,15 +41,15 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @foo_int32_float(<4 x float> %src) {
 ; CHECK-MVE-LABEL: foo_int32_float:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s2
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s0
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s8, s3
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s1
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s2, s2
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s0, s0
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s3
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s1
+; CHECK-MVE-NEXT:    vmov r0, s2
+; CHECK-MVE-NEXT:    vmov r1, s0
+; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-MVE-NEXT:    vmov r0, s4
 ; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov r1, s10
 ; CHECK-MVE-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -67,15 +65,15 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @foo_uint32_float(<4 x float> %src) {
 ; CHECK-MVE-LABEL: foo_uint32_float:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s4, s2
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s6, s0
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s8, s3
-; CHECK-MVE-NEXT:    vcvt.u32.f32 s10, s1
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s2, s2
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s0, s0
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s4, s3
+; CHECK-MVE-NEXT:    vcvt.u32.f32 s6, s1
+; CHECK-MVE-NEXT:    vmov r0, s2
+; CHECK-MVE-NEXT:    vmov r1, s0
+; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
 ; CHECK-MVE-NEXT:    vmov r0, s4
 ; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov r1, s10
 ; CHECK-MVE-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -96,28 +94,28 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_int16(<8 x i16> %src) {
 ; CHECK-MVE-NEXT:    vmov s0, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[1]
 ; CHECK-MVE-NEXT:    vmov s2, r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q1[3]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s2
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s0, s0
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s2, s2
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[3]
+; CHECK-MVE-NEXT:    vins.f16 s0, s2
+; CHECK-MVE-NEXT:    vmov s2, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[2]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
-; CHECK-MVE-NEXT:    vmov s10, r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q1[4]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s1, s10
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s2, s2
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[4]
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s1, s8
+; CHECK-MVE-NEXT:    vins.f16 s1, s2
+; CHECK-MVE-NEXT:    vmov s2, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[5]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s2, s8
-; CHECK-MVE-NEXT:    vmov s10, r0
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s2, s2
+; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[7]
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[6]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s10, s10
-; CHECK-MVE-NEXT:    vmov s4, r0
-; CHECK-MVE-NEXT:    vins.f16 s2, s10
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
+; CHECK-MVE-NEXT:    vmov s4, r0
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s3, s4
 ; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
@@ -139,28 +137,28 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_uint16(<8 x i16> %src) {
 ; CHECK-MVE-NEXT:    vmov s0, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-MVE-NEXT:    vmov s2, r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s2
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s0, s0
-; CHECK-MVE-NEXT:    vins.f16 s0, s8
-; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s2, s2
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-MVE-NEXT:    vins.f16 s0, s2
+; CHECK-MVE-NEXT:    vmov s2, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
-; CHECK-MVE-NEXT:    vmov s10, r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s1, s10
-; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s2, s2
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s1, s8
+; CHECK-MVE-NEXT:    vins.f16 s1, s2
+; CHECK-MVE-NEXT:    vmov s2, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s2, s8
-; CHECK-MVE-NEXT:    vmov s10, r0
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s2, s2
+; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s10, s10
-; CHECK-MVE-NEXT:    vmov s4, r0
-; CHECK-MVE-NEXT:    vins.f16 s2, s10
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
+; CHECK-MVE-NEXT:    vmov s4, r0
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s3, s4
 ; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
@@ -177,15 +175,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @foo_int16_half(<8 x half> %src) {
 ; CHECK-MVE-LABEL: foo_int16_half:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
+; CHECK-MVE-NEXT:    vcvt.s32.f16 s12, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s0
 ; CHECK-MVE-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-MVE-NEXT:    vcvt.s32.f16 s14, s14
+; CHECK-MVE-NEXT:    vcvt.s32.f16 s14, s2
 ; CHECK-MVE-NEXT:    vmov r0, s0
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vmovx.f16 s10, s1
 ; CHECK-MVE-NEXT:    vcvt.s32.f16 s8, s3
-; CHECK-MVE-NEXT:    vcvt.s32.f16 s12, s2
 ; CHECK-MVE-NEXT:    vcvt.s32.f16 s5, s1
 ; CHECK-MVE-NEXT:    vmov.16 q0[0], r0
 ; CHECK-MVE-NEXT:    vmov r0, s14
@@ -219,15 +217,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @foo_uint16_half(<8 x half> %src) {
 ; CHECK-MVE-LABEL: foo_uint16_half:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
+; CHECK-MVE-NEXT:    vcvt.s32.f16 s12, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s0
 ; CHECK-MVE-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-MVE-NEXT:    vcvt.s32.f16 s14, s14
+; CHECK-MVE-NEXT:    vcvt.s32.f16 s14, s2
 ; CHECK-MVE-NEXT:    vmov r0, s0
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vmovx.f16 s10, s1
 ; CHECK-MVE-NEXT:    vcvt.s32.f16 s8, s3
-; CHECK-MVE-NEXT:    vcvt.s32.f16 s12, s2
 ; CHECK-MVE-NEXT:    vcvt.s32.f16 s5, s1
 ; CHECK-MVE-NEXT:    vmov.16 q0[0], r0
 ; CHECK-MVE-NEXT:    vmov r0, s14
@@ -355,14 +353,13 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc1(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-MVE-LABEL: vmovn32_trunc1:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q2, q0
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s8
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
 ; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s4
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s9
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s1
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s3
 ; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s5
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s10
 ; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s6
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s11
 ; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -380,15 +377,14 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc2(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-MVE-LABEL: vmovn32_trunc2:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov q2, q0
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s4
-; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s5
-; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s6
-; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s7
-; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
+; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s1
+; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s2
+; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s3
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s4
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s5
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s6
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s7
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vmovn32_trunc2:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
index 98ee5fdd3f34e..844e39e2964bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -4,11 +4,10 @@
 define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
 ; CHECK-LABEL: fpext_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
 ; CHECK-NEXT:    bx lr
 entry:
   %out = fpext <4 x half> %src1 to <4 x float>
@@ -19,12 +18,12 @@ define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
 ; CHECK-LABEL: fpext_8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
 ; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
 ; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
+; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
+; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
 ; CHECK-NEXT:    vcvtb.f32.f16 s4, s2
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
@@ -37,11 +36,10 @@ entry:
 define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) {
 ; CHECK-LABEL: fptrunc_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtb.f16.f32 s4, s0
-; CHECK-NEXT:    vcvtt.f16.f32 s4, s1
-; CHECK-NEXT:    vcvtb.f16.f32 s5, s2
-; CHECK-NEXT:    vcvtt.f16.f32 s5, s3
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s2
+; CHECK-NEXT:    vcvtt.f16.f32 s0, s1
+; CHECK-NEXT:    vcvtt.f16.f32 s1, s3
 ; CHECK-NEXT:    bx lr
 entry:
   %out = fptrunc <4 x float> %src1 to <4 x half>
@@ -51,15 +49,14 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) {
 ; CHECK-LABEL: fptrunc_8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
-; CHECK-NEXT:    vcvtt.f16.f32 s0, s9
-; CHECK-NEXT:    vcvtb.f16.f32 s1, s10
-; CHECK-NEXT:    vcvtt.f16.f32 s1, s11
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    vcvtb.f16.f32 s2, s2
 ; CHECK-NEXT:    vcvtb.f16.f32 s2, s4
+; CHECK-NEXT:    vcvtt.f16.f32 s0, s1
+; CHECK-NEXT:    vcvtt.f16.f32 s1, s3
 ; CHECK-NEXT:    vcvtt.f16.f32 s2, s5
-; CHECK-NEXT:    vcvtb.f16.f32 s3, s6
 ; CHECK-NEXT:    vcvtt.f16.f32 s3, s7
+; CHECK-NEXT:    vcvtb.f16.f32 s4, s6
 ; CHECK-NEXT:    bx lr
 entry:
   %out = fptrunc <8 x float> %src1 to <8 x half>
@@ -247,12 +244,12 @@ define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(<16 x half>* %src) {
 ; CHECK-NEXT:    vld20.16 {q2, q3}, [r0]
 ; CHECK-NEXT:    vld21.16 {q2, q3}, [r0]
 ; CHECK-NEXT:    vcvtt.f32.f16 s3, s9
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s11
 ; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
 ; CHECK-NEXT:    vcvtt.f32.f16 s1, s8
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s10
 ; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
+; CHECK-NEXT:    vcvtt.f32.f16 s7, s11
+; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
+; CHECK-NEXT:    vcvtt.f32.f16 s5, s10
 ; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
index 0f7393c85d670..f444ec4ef1e94 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -136,7 +136,6 @@ entry:
 define arm_aapcs_vfpcc <2 x double> @vdup_f64(double %src) {
 ; CHECK-LABEL: vdup_f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    vmov.f32 s2, s0
 ; CHECK-NEXT:    vmov.f32 s3, s1
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
index c96baf10bc607..bd2aa4be5fab7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
@@ -16,17 +16,17 @@ entry:
 define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fadd_v4f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vadd.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vadd.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vadd.f32 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fadd_v4f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vadd.f32 s6, s0, s1
-; CHECK-NOFP-NEXT:    vadd.f32 s6, s6, s2
-; CHECK-NOFP-NEXT:    vadd.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    vadd.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -38,21 +38,21 @@ define arm_aapcs_vfpcc float @fadd_v8f32(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fadd_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vadd.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vadd.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vadd.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fadd_v8f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vadd.f32 s12, s0, s4
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vadd.f32 s10, s1, s5
-; CHECK-NOFP-NEXT:    vadd.f32 s14, s2, s6
-; CHECK-NOFP-NEXT:    vadd.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vadd.f32 s10, s12, s10
-; CHECK-NOFP-NEXT:    vadd.f32 s2, s10, s14
-; CHECK-NOFP-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-NOFP-NEXT:    vadd.f32 s2, s2, s6
+; CHECK-NOFP-NEXT:    vadd.f32 s4, s3, s7
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s10
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vadd.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -63,8 +63,8 @@ entry:
 define arm_aapcs_vfpcc half @fadd_v2f16(<2 x half> %x, half %y) {
 ; CHECK-LABEL: fadd_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vadd.f16 s0, s0, s6
+; CHECK-NEXT:    vmovx.f16 s2, s0
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-NEXT:    vadd.f16 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -75,21 +75,21 @@ entry:
 define arm_aapcs_vfpcc half @fadd_v4f16(<4 x half> %x, half %y) {
 ; CHECK-FP-LABEL: fadd_v4f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmovx.f16 s8, s0
-; CHECK-FP-NEXT:    vadd.f16 s6, s1, s6
-; CHECK-FP-NEXT:    vadd.f16 s0, s0, s8
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vadd.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vadd.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vadd.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fadd_v4f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vadd.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -102,25 +102,25 @@ define arm_aapcs_vfpcc half @fadd_v8f16(<8 x half> %x, half %y) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q2, q0
 ; CHECK-FP-NEXT:    vadd.f16 q0, q0, q2
-; CHECK-FP-NEXT:    vadd.f16 s6, s2, s3
+; CHECK-FP-NEXT:    vadd.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vadd.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vadd.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fadd_v8f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s2
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s3
-; CHECK-NOFP-NEXT:    vadd.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -134,37 +134,37 @@ define arm_aapcs_vfpcc half @fadd_v16f16(<16 x half> %x, half %y) {
 ; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vadd.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vadd.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vadd.f16 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fadd_v16f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vadd.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vadd.f16 s12, s0, s4
-; CHECK-NOFP-NEXT:    vadd.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vadd.f16 s12, s1, s5
-; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
-; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vadd.f16 s12, s2, s6
-; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
-; CHECK-NOFP-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vadd.f16 s12, s3, s7
-; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s10
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s1, s5
 ; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vadd.f16 s0, s10, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s10, s4
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s2, s2, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vadd.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vadd.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -196,9 +196,9 @@ entry:
 define arm_aapcs_vfpcc double @fadd_v4f64(<4 x double> %x, double %y) {
 ; CHECK-LABEL: fadd_v4f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f64 d5, d1, d3
+; CHECK-NEXT:    vadd.f64 d1, d1, d3
 ; CHECK-NEXT:    vadd.f64 d0, d0, d2
-; CHECK-NEXT:    vadd.f64 d0, d0, d5
+; CHECK-NEXT:    vadd.f64 d0, d0, d1
 ; CHECK-NEXT:    vadd.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -209,8 +209,8 @@ entry:
 define arm_aapcs_vfpcc float @fadd_v2f32_nofast(<2 x float> %x, float %y) {
 ; CHECK-LABEL: fadd_v2f32_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f32 s4, s4, s0
-; CHECK-NEXT:    vadd.f32 s0, s4, s1
+; CHECK-NEXT:    vadd.f32 s0, s4, s0
+; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
@@ -220,10 +220,10 @@ entry:
 define arm_aapcs_vfpcc float @fadd_v4f32_nofast(<4 x float> %x, float %y) {
 ; CHECK-LABEL: fadd_v4f32_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f32 s4, s4, s0
-; CHECK-NEXT:    vadd.f32 s4, s4, s1
-; CHECK-NEXT:    vadd.f32 s4, s4, s2
-; CHECK-NEXT:    vadd.f32 s0, s4, s3
+; CHECK-NEXT:    vadd.f32 s0, s4, s0
+; CHECK-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    vadd.f32 s0, s0, s3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
@@ -233,10 +233,10 @@ entry:
 define arm_aapcs_vfpcc float @fadd_v8f32_nofast(<8 x float> %x, float %y) {
 ; CHECK-LABEL: fadd_v8f32_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f32 s8, s8, s0
-; CHECK-NEXT:    vadd.f32 s8, s8, s1
-; CHECK-NEXT:    vadd.f32 s8, s8, s2
-; CHECK-NEXT:    vadd.f32 s0, s8, s3
+; CHECK-NEXT:    vadd.f32 s0, s8, s0
+; CHECK-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    vadd.f32 s0, s0, s3
 ; CHECK-NEXT:    vadd.f32 s0, s0, s4
 ; CHECK-NEXT:    vadd.f32 s0, s0, s5
 ; CHECK-NEXT:    vadd.f32 s0, s0, s6
@@ -250,12 +250,12 @@ entry:
 define arm_aapcs_vfpcc half @fadd_v4f16_nofast(<4 x half> %x, half %y) {
 ; CHECK-LABEL: fadd_v4f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f16 s4, s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vadd.f16 s4, s4, s1
-; CHECK-NEXT:    vadd.f16 s0, s4, s0
+; CHECK-NEXT:    vadd.f16 s2, s4, s0
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-NEXT:    vmovx.f16 s2, s1
+; CHECK-NEXT:    vadd.f16 s0, s0, s1
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
@@ -266,17 +266,17 @@ define arm_aapcs_vfpcc half @fadd_v8f16_nofast(<8 x half> %x, half %y) {
 ; CHECK-LABEL: fadd_v8f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vadd.f16 s4, s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vadd.f16 s4, s4, s1
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vadd.f16 s4, s4, s2
-; CHECK-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-NEXT:    vadd.f16 s4, s4, s3
+; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vadd.f16 s0, s4, s0
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vadd.f16 s0, s0, s1
+; CHECK-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s3
+; CHECK-NEXT:    vadd.f16 s0, s0, s3
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
@@ -287,18 +287,18 @@ define arm_aapcs_vfpcc half @fadd_v16f16_nofast(<16 x half> %x, half %y) {
 ; CHECK-LABEL: fadd_v16f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vadd.f16 s8, s8, s0
-; CHECK-NEXT:    vmovx.f16 s10, s0
-; CHECK-NEXT:    vadd.f16 s8, s8, s10
-; CHECK-NEXT:    vmovx.f16 s10, s1
-; CHECK-NEXT:    vadd.f16 s8, s8, s1
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vadd.f16 s8, s8, s10
-; CHECK-NEXT:    vmovx.f16 s10, s2
-; CHECK-NEXT:    vadd.f16 s8, s8, s2
-; CHECK-NEXT:    vmovx.f16 s2, s4
-; CHECK-NEXT:    vadd.f16 s8, s8, s10
-; CHECK-NEXT:    vadd.f16 s8, s8, s3
+; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vadd.f16 s0, s8, s0
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vadd.f16 s0, s0, s1
+; CHECK-NEXT:    vadd.f16 s0, s0, s8
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s3
+; CHECK-NEXT:    vadd.f16 s0, s0, s3
+; CHECK-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s4
 ; CHECK-NEXT:    vadd.f16 s0, s0, s4
 ; CHECK-NEXT:    vadd.f16 s0, s0, s2
 ; CHECK-NEXT:    vmovx.f16 s2, s5
@@ -329,8 +329,8 @@ entry:
 define arm_aapcs_vfpcc double @fadd_v2f64_nofast(<2 x double> %x, double %y) {
 ; CHECK-LABEL: fadd_v2f64_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f64 d2, d2, d0
-; CHECK-NEXT:    vadd.f64 d0, d2, d1
+; CHECK-NEXT:    vadd.f64 d0, d2, d0
+; CHECK-NEXT:    vadd.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
@@ -340,8 +340,8 @@ entry:
 define arm_aapcs_vfpcc double @fadd_v4f64_nofast(<4 x double> %x, double %y) {
 ; CHECK-LABEL: fadd_v4f64_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f64 d4, d4, d0
-; CHECK-NEXT:    vadd.f64 d0, d4, d1
+; CHECK-NEXT:    vadd.f64 d0, d4, d0
+; CHECK-NEXT:    vadd.f64 d0, d0, d1
 ; CHECK-NEXT:    vadd.f64 d0, d0, d2
 ; CHECK-NEXT:    vadd.f64 d0, d0, d3
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
index 07a0077b09301..7cafb7262f460 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
@@ -15,16 +15,16 @@ entry:
 define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) {
 ; CHECK-FP-LABEL: fmin_v4f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vminnm.f32 s4, s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f32 s4, s4, s2
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x)
@@ -36,9 +36,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) {
 ; CHECK-FP-LABEL: fmin_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f32:
@@ -49,15 +49,15 @@ define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s8, s1, s5
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s6, s2
-; CHECK-NOFP-NEXT:    vselgt.f32 s10, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s7, s3
-; CHECK-NOFP-NEXT:    vselgt.f32 s12, s2, s6
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s8
+; CHECK-NOFP-NEXT:    vselgt.f32 s2, s2, s6
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vselgt.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vminnm.f32 s2, s10, s8
-; CHECK-NOFP-NEXT:    vminnm.f32 s2, s2, s12
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s2, s0
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vselgt.f32 s4, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x)
@@ -67,20 +67,20 @@ entry:
 define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) {
 ; CHECK-FP-LABEL: fmin_v4f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s4, s1
-; CHECK-FP-NEXT:    vmovx.f16 s6, s0
-; CHECK-FP-NEXT:    vminnm.f16 s4, s1, s4
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s4, s0
+; CHECK-FP-NEXT:    vminnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x)
@@ -92,24 +92,24 @@ define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x)
@@ -122,9 +122,9 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v16f16:
@@ -132,42 +132,42 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s0
 ; CHECK-NOFP-NEXT:    vcmp.f16 s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s1
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s4, s0
 ; CHECK-NOFP-NEXT:    vselgt.f16 s8, s10, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s5, s1
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s10, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s1, s5
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s5
-; CHECK-NOFP-NEXT:    vcmp.f16 s10, s12
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s8
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vcmp.f16 s4, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s6, s2
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s12, s10
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s8, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s2
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s2, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s6
-; CHECK-NOFP-NEXT:    vcmp.f16 s10, s12
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vcmp.f16 s4, s2
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s7, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s12, s10
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s4, s0
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vcmp.f16 s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s8, s0
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x)
@@ -199,10 +199,10 @@ define arm_aapcs_vfpcc double @fmin_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    vcmp.f64 d3, d1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f64 d2, d0
-; CHECK-NEXT:    vselgt.f64 d4, d1, d3
+; CHECK-NEXT:    vselgt.f64 d1, d1, d3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f64 d0, d0, d2
-; CHECK-NEXT:    vminnm.f64 d0, d0, d4
+; CHECK-NEXT:    vminnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x)
@@ -222,16 +222,16 @@ entry:
 define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) {
 ; CHECK-FP-LABEL: fmin_v4f32_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f32_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vminnm.f32 s4, s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f32 s4, s4, s2
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x)
@@ -242,20 +242,20 @@ define arm_aapcs_vfpcc float @fmin_v8f32_nofast(<8 x float> %x) {
 ; CHECK-FP-LABEL: fmin_v8f32_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f32_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vminnm.f32 s10, s0, s4
 ; CHECK-NOFP-NEXT:    vminnm.f32 s8, s1, s5
-; CHECK-NOFP-NEXT:    vminnm.f32 s8, s10, s8
-; CHECK-NOFP-NEXT:    vminnm.f32 s10, s2, s6
-; CHECK-NOFP-NEXT:    vminnm.f32 s8, s8, s10
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s8, s0
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s8
+; CHECK-NOFP-NEXT:    vminnm.f32 s2, s2, s6
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s2, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x)
@@ -265,20 +265,20 @@ entry:
 define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) {
 ; CHECK-FP-LABEL: fmin_v4f16_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s4, s1
-; CHECK-FP-NEXT:    vmovx.f16 s6, s0
-; CHECK-FP-NEXT:    vminnm.f16 s4, s1, s4
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s4, s0
+; CHECK-FP-NEXT:    vminnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x)
@@ -290,24 +290,24 @@ define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x)
@@ -320,36 +320,36 @@ define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v16f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vminnm.f16 s8, s10, s8
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s0, s4
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s10, s8
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s1, s5
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s5
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s1
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s2, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s3, s7
-; CHECK-NOFP-NEXT:    vminnm.f16 s8, s8, s10
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s8
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s1, s5
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s8, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s8, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s2, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x)
@@ -378,9 +378,9 @@ entry:
 define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) {
 ; CHECK-LABEL: fmin_v4f64_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vminnm.f64 d4, d1, d3
+; CHECK-NEXT:    vminnm.f64 d1, d1, d3
 ; CHECK-NEXT:    vminnm.f64 d0, d0, d2
-; CHECK-NEXT:    vminnm.f64 d0, d0, d4
+; CHECK-NEXT:    vminnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x)
@@ -403,17 +403,17 @@ entry:
 define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmin_v4f32_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vminnm.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f32_acc:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vminnm.f32 s6, s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f32 s6, s6, s2
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -427,9 +427,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmin_v8f32_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -441,15 +441,15 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s10, s1, s5
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s6, s2
-; CHECK-NOFP-NEXT:    vselgt.f32 s12, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s7, s3
-; CHECK-NOFP-NEXT:    vselgt.f32 s14, s2, s6
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s10
+; CHECK-NOFP-NEXT:    vselgt.f32 s2, s2, s6
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vselgt.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vminnm.f32 s2, s12, s10
-; CHECK-NOFP-NEXT:    vminnm.f32 s2, s2, s14
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s2, s0
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vselgt.f32 s4, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -462,21 +462,21 @@ entry:
 define arm_aapcs_vfpcc half @fmin_v4f16_acc(<4 x half> %x, half %y) {
 ; CHECK-FP-LABEL: fmin_v4f16_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmovx.f16 s8, s0
-; CHECK-FP-NEXT:    vminnm.f16 s6, s1, s6
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s8
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vminnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f16_acc:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -489,8 +489,8 @@ entry:
 define arm_aapcs_vfpcc half @fmin_v2f16_acc(<2 x half> %x, half %y) {
 ; CHECK-LABEL: fmin_v2f16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-NEXT:    vmovx.f16 s2, s0
+; CHECK-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -505,25 +505,25 @@ define arm_aapcs_vfpcc half @fmin_v8f16_acc(<8 x half> %x, half %y) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q2, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q2
-; CHECK-FP-NEXT:    vminnm.f16 s6, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f16_acc:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -539,9 +539,9 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc(<16 x half> %x, half %y) {
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -550,42 +550,42 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc(<16 x half> %x, half %y) {
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NOFP-NEXT:    vcmp.f16 s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s4, s0
 ; CHECK-NOFP-NEXT:    vselgt.f16 s10, s12, s10
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s5, s1
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s1, s5
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
-; CHECK-NOFP-NEXT:    vcmp.f16 s12, s14
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s10
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s1
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vcmp.f16 s4, s10
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s6, s2
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s14, s12
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s10, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s2, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
-; CHECK-NOFP-NEXT:    vcmp.f16 s12, s14
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vcmp.f16 s4, s2
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s7, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s14, s12
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s4, s0
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vcmp.f16 s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s10, s0
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -626,10 +626,10 @@ define arm_aapcs_vfpcc double @fmin_v4f64_acc(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vcmp.f64 d3, d1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f64 d2, d0
-; CHECK-NEXT:    vselgt.f64 d5, d1, d3
+; CHECK-NEXT:    vselgt.f64 d1, d1, d3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f64 d0, d0, d2
-; CHECK-NEXT:    vminnm.f64 d0, d0, d5
+; CHECK-NEXT:    vminnm.f64 d0, d0, d1
 ; CHECK-NEXT:    vminnm.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -657,9 +657,9 @@ entry:
 define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmin_v4f32_acc_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vminnm.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f32 s0, s4
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f32 s0, s4, s0
@@ -667,9 +667,9 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) {
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f32_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vminnm.f32 s6, s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f32 s6, s6, s2
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    vcmp.f32 s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s4, s0
@@ -685,9 +685,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmin_v8f32_acc_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f32 s0, s8
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f32 s0, s8, s0
@@ -695,13 +695,13 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) {
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f32_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vminnm.f32 s12, s0, s4
 ; CHECK-NOFP-NEXT:    vminnm.f32 s10, s1, s5
-; CHECK-NOFP-NEXT:    vminnm.f32 s10, s12, s10
-; CHECK-NOFP-NEXT:    vminnm.f32 s12, s2, s6
-; CHECK-NOFP-NEXT:    vminnm.f32 s10, s10, s12
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s10, s0
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s10
+; CHECK-NOFP-NEXT:    vminnm.f32 s2, s2, s6
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s2, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f32 s0, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s8, s0
@@ -716,11 +716,11 @@ entry:
 define arm_aapcs_vfpcc half @fmin_v4f16_acc_nofast(<4 x half> %x, half %y) {
 ; CHECK-FP-LABEL: fmin_v4f16_acc_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmovx.f16 s8, s0
-; CHECK-FP-NEXT:    vminnm.f16 s6, s1, s6
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s8
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vminnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f16 s0, s4
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f16 s0, s4, s0
@@ -728,11 +728,11 @@ define arm_aapcs_vfpcc half @fmin_v4f16_acc_nofast(<4 x half> %x, half %y) {
 ;
 ; CHECK-NOFP-LABEL: fmin_v4f16_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s4, s0
@@ -749,9 +749,9 @@ define arm_aapcs_vfpcc half @fmin_v8f16_acc_nofast(<8 x half> %x, half %y) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q2, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q2
-; CHECK-FP-NEXT:    vminnm.f16 s6, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f16 s0, s4
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f16 s0, s4, s0
@@ -760,16 +760,16 @@ define arm_aapcs_vfpcc half @fmin_v8f16_acc_nofast(<8 x half> %x, half %y) {
 ; CHECK-NOFP-LABEL: fmin_v8f16_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vminnm.f16 s6, s6, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s4, s0
@@ -787,9 +787,9 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc_nofast(<16 x half> %x, half %y) {
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f16 s0, s8
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f16 s0, s8, s0
@@ -797,29 +797,29 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc_nofast(<16 x half> %x, half %y) {
 ;
 ; CHECK-NOFP-LABEL: fmin_v16f16_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vminnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vminnm.f16 s12, s0, s4
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vminnm.f16 s12, s1, s5
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vminnm.f16 s12, s2, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
-; CHECK-NOFP-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vminnm.f16 s12, s3, s7
-; CHECK-NOFP-NEXT:    vminnm.f16 s10, s10, s12
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s10
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s10, s4
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s10, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s2, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s8, s0
@@ -863,9 +863,9 @@ entry:
 define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y) {
 ; CHECK-LABEL: fmin_v4f64_acc_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vminnm.f64 d5, d1, d3
+; CHECK-NEXT:    vminnm.f64 d1, d1, d3
 ; CHECK-NEXT:    vminnm.f64 d0, d0, d2
-; CHECK-NEXT:    vminnm.f64 d0, d0, d5
+; CHECK-NEXT:    vminnm.f64 d0, d0, d1
 ; CHECK-NEXT:    vcmp.f64 d0, d4
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f64 d0, d4, d0
@@ -890,16 +890,16 @@ entry:
 define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) {
 ; CHECK-FP-LABEL: fmax_v4f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s4, s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s4, s4, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x)
@@ -910,9 +910,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32(<8 x float> %x) {
 ; CHECK-FP-LABEL: fmax_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f32:
@@ -923,15 +923,15 @@ define arm_aapcs_vfpcc float @fmax_v8f32(<8 x float> %x) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s8, s1, s5
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s2, s6
-; CHECK-NOFP-NEXT:    vselgt.f32 s10, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s3, s7
-; CHECK-NOFP-NEXT:    vselgt.f32 s12, s2, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s8
+; CHECK-NOFP-NEXT:    vselgt.f32 s2, s2, s6
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vselgt.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s10, s8
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s2, s12
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s2, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vselgt.f32 s4, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x)
@@ -941,20 +941,20 @@ entry:
 define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) {
 ; CHECK-FP-LABEL: fmax_v4f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s4, s1
-; CHECK-FP-NEXT:    vmovx.f16 s6, s0
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s1, s4
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s4, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x)
@@ -966,24 +966,24 @@ define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x)
@@ -996,9 +996,9 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v16f16:
@@ -1006,42 +1006,42 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s0
 ; CHECK-NOFP-NEXT:    vcmp.f16 s10, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s1
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s4
 ; CHECK-NOFP-NEXT:    vselgt.f16 s8, s10, s8
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s1, s5
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s10, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s1, s5
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s5
-; CHECK-NOFP-NEXT:    vcmp.f16 s12, s10
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s8
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vcmp.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s2, s6
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s12, s10
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s8, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s2
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s2, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s6
-; CHECK-NOFP-NEXT:    vcmp.f16 s12, s10
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vcmp.f16 s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s3, s7
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s12, s10
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vselgt.f16 s10, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vcmp.f16 s4, s2
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s8, s0
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x)
@@ -1073,10 +1073,10 @@ define arm_aapcs_vfpcc double @fmax_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    vcmp.f64 d1, d3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f64 d0, d2
-; CHECK-NEXT:    vselgt.f64 d4, d1, d3
+; CHECK-NEXT:    vselgt.f64 d1, d1, d3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f64 d0, d0, d2
-; CHECK-NEXT:    vmaxnm.f64 d0, d0, d4
+; CHECK-NEXT:    vmaxnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x)
@@ -1096,16 +1096,16 @@ entry:
 define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) {
 ; CHECK-FP-LABEL: fmax_v4f32_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f32_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s4, s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s4, s4, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x)
@@ -1116,20 +1116,20 @@ define arm_aapcs_vfpcc float @fmax_v8f32_nofast(<8 x float> %x) {
 ; CHECK-FP-LABEL: fmax_v8f32_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f32_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s10, s0, s4
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s8, s1, s5
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s8, s10, s8
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s10, s2, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s8, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s8
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s2, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x)
@@ -1139,20 +1139,20 @@ entry:
 define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) {
 ; CHECK-FP-LABEL: fmax_v4f16_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s4, s1
-; CHECK-FP-NEXT:    vmovx.f16 s6, s0
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s1, s4
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s4, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x)
@@ -1164,24 +1164,24 @@ define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x)
@@ -1194,36 +1194,36 @@ define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) {
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v16f16_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s10, s8
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s10, s8
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s1, s5
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s5
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s1
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s2, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s3, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s8, s8, s10
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s8
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s1, s5
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s8, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s8, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s2, s2, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x)
@@ -1252,9 +1252,9 @@ entry:
 define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) {
 ; CHECK-LABEL: fmax_v4f64_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmaxnm.f64 d4, d1, d3
+; CHECK-NEXT:    vmaxnm.f64 d1, d1, d3
 ; CHECK-NEXT:    vmaxnm.f64 d0, d0, d2
-; CHECK-NEXT:    vmaxnm.f64 d0, d0, d4
+; CHECK-NEXT:    vmaxnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x)
@@ -1277,17 +1277,17 @@ entry:
 define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmax_v4f32_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmaxnm.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f32_acc:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s6, s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s6, s6, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -1301,9 +1301,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmax_v8f32_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -1315,15 +1315,15 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s10, s1, s5
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s2, s6
-; CHECK-NOFP-NEXT:    vselgt.f32 s12, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f32 s3, s7
-; CHECK-NOFP-NEXT:    vselgt.f32 s14, s2, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s10
+; CHECK-NOFP-NEXT:    vselgt.f32 s2, s2, s6
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vselgt.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s12, s10
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s2, s14
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s2, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vselgt.f32 s4, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -1336,8 +1336,8 @@ entry:
 define arm_aapcs_vfpcc half @fmax_v2f16_acc(<2 x half> %x, half %y) {
 ; CHECK-LABEL: fmax_v2f16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-NEXT:    vmovx.f16 s2, s0
+; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1350,21 +1350,21 @@ entry:
 define arm_aapcs_vfpcc half @fmax_v4f16_acc(<4 x half> %x, half %y) {
 ; CHECK-FP-LABEL: fmax_v4f16_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmovx.f16 s8, s0
-; CHECK-FP-NEXT:    vmaxnm.f16 s6, s1, s6
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s8
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f16_acc:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -1379,25 +1379,25 @@ define arm_aapcs_vfpcc half @fmax_v8f16_acc(<8 x half> %x, half %y) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q2, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q2
-; CHECK-FP-NEXT:    vmaxnm.f16 s6, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f16_acc:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -1413,9 +1413,9 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc(<16 x half> %x, half %y) {
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -1424,42 +1424,42 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc(<16 x half> %x, half %y) {
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NOFP-NEXT:    vcmp.f16 s12, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s0, s4
 ; CHECK-NOFP-NEXT:    vselgt.f16 s10, s12, s10
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s1, s5
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s1, s5
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
-; CHECK-NOFP-NEXT:    vcmp.f16 s14, s12
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s10
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s1
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vcmp.f16 s10, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s2, s6
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s14, s12
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s10, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s2, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
-; CHECK-NOFP-NEXT:    vcmp.f16 s14, s12
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vselgt.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vcmp.f16 s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vcmp.f16 s3, s7
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s14, s12
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s2, s4
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vselgt.f16 s12, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vcmp.f16 s4, s2
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s10, s0
+; CHECK-NOFP-NEXT:    vselgt.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -1500,10 +1500,10 @@ define arm_aapcs_vfpcc double @fmax_v4f64_acc(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vcmp.f64 d1, d3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f64 d0, d2
-; CHECK-NEXT:    vselgt.f64 d5, d1, d3
+; CHECK-NEXT:    vselgt.f64 d1, d1, d3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f64 d0, d0, d2
-; CHECK-NEXT:    vmaxnm.f64 d0, d0, d5
+; CHECK-NEXT:    vmaxnm.f64 d0, d0, d1
 ; CHECK-NEXT:    vmaxnm.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1531,9 +1531,9 @@ entry:
 define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmax_v4f32_acc_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmaxnm.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f32 s4, s0
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f32 s0, s4, s0
@@ -1541,9 +1541,9 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) {
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f32_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s6, s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s6, s6, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    vcmp.f32 s4, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s4, s0
@@ -1559,9 +1559,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmax_v8f32_acc_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f32 s8, s0
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f32 s0, s8, s0
@@ -1569,13 +1569,13 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) {
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f32_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s12, s0, s4
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s10, s1, s5
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s12, s2, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s10, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s10
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s2, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s2, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f32 s8, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s8, s0
@@ -1590,11 +1590,11 @@ entry:
 define arm_aapcs_vfpcc half @fmax_v4f16_acc_nofast(<4 x half> %x, half %y) {
 ; CHECK-FP-LABEL: fmax_v4f16_acc_nofast:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmovx.f16 s8, s0
-; CHECK-FP-NEXT:    vmaxnm.f16 s6, s1, s6
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s8
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f16 s4, s0
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f16 s0, s4, s0
@@ -1602,11 +1602,11 @@ define arm_aapcs_vfpcc half @fmax_v4f16_acc_nofast(<4 x half> %x, half %y) {
 ;
 ; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f16 s4, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s4, s0
@@ -1623,9 +1623,9 @@ define arm_aapcs_vfpcc half @fmax_v8f16_acc_nofast(<8 x half> %x, half %y) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q2, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q2
-; CHECK-FP-NEXT:    vmaxnm.f16 s6, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f16 s4, s0
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f16 s0, s4, s0
@@ -1634,16 +1634,16 @@ define arm_aapcs_vfpcc half @fmax_v8f16_acc_nofast(<8 x half> %x, half %y) {
 ; CHECK-NOFP-LABEL: fmax_v8f16_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s6, s6, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f16 s4, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s4, s0
@@ -1661,9 +1661,9 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc_nofast(<16 x half> %x, half %y) {
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vcmp.f16 s8, s0
 ; CHECK-FP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-FP-NEXT:    vselgt.f16 s0, s8, s0
@@ -1671,29 +1671,29 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc_nofast(<16 x half> %x, half %y) {
 ;
 ; CHECK-NOFP-LABEL: fmax_v16f16_acc_nofast:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s12, s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s12, s1, s5
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s12, s2, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s12, s3, s7
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s10, s10, s12
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s10
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s10, s4
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s10, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s2, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s2, s2, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vcmp.f16 s8, s0
 ; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NOFP-NEXT:    vselgt.f16 s0, s8, s0
@@ -1737,9 +1737,9 @@ entry:
 define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y) {
 ; CHECK-LABEL: fmax_v4f64_acc_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmaxnm.f64 d5, d1, d3
+; CHECK-NEXT:    vmaxnm.f64 d1, d1, d3
 ; CHECK-NEXT:    vmaxnm.f64 d0, d0, d2
-; CHECK-NEXT:    vmaxnm.f64 d0, d0, d5
+; CHECK-NEXT:    vmaxnm.f64 d0, d0, d1
 ; CHECK-NEXT:    vcmp.f64 d4, d0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vselgt.f64 d0, d4, d0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
index bbc30d99d10c6..b847b05f566f1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
@@ -16,17 +16,17 @@ entry:
 define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmul_v4f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmul.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vmul.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmul.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmul.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vmul.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vmul.f32 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmul_v4f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmul.f32 s6, s0, s1
-; CHECK-NOFP-NEXT:    vmul.f32 s6, s6, s2
-; CHECK-NOFP-NEXT:    vmul.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s0, s3
 ; CHECK-NOFP-NEXT:    vmul.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -38,21 +38,21 @@ define arm_aapcs_vfpcc float @fmul_v8f32(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmul_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmul.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmul.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmul.f32 s2, s2, s3
 ; CHECK-FP-NEXT:    vmul.f32 s0, s0, s1
-; CHECK-FP-NEXT:    vmul.f32 s0, s0, s4
+; CHECK-FP-NEXT:    vmul.f32 s0, s0, s2
 ; CHECK-FP-NEXT:    vmul.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmul_v8f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmul.f32 s12, s0, s4
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmul.f32 s10, s1, s5
-; CHECK-NOFP-NEXT:    vmul.f32 s14, s2, s6
-; CHECK-NOFP-NEXT:    vmul.f32 s0, s3, s7
-; CHECK-NOFP-NEXT:    vmul.f32 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmul.f32 s2, s10, s14
-; CHECK-NOFP-NEXT:    vmul.f32 s0, s2, s0
+; CHECK-NOFP-NEXT:    vmul.f32 s2, s2, s6
+; CHECK-NOFP-NEXT:    vmul.f32 s4, s3, s7
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s0, s10
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmul.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -63,8 +63,8 @@ entry:
 define arm_aapcs_vfpcc half @fmul_v2f16(<2 x half> %x, half %y) {
 ; CHECK-LABEL: fmul_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vmul.f16 s0, s0, s6
+; CHECK-NEXT:    vmovx.f16 s2, s0
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-NEXT:    vmul.f16 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -75,21 +75,21 @@ entry:
 define arm_aapcs_vfpcc half @fmul_v4f16(<4 x half> %x, half %y) {
 ; CHECK-FP-LABEL: fmul_v4f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmovx.f16 s8, s0
-; CHECK-FP-NEXT:    vmul.f16 s6, s1, s6
-; CHECK-FP-NEXT:    vmul.f16 s0, s0, s8
+; CHECK-FP-NEXT:    vmovx.f16 s2, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vmul.f16 s2, s1, s2
 ; CHECK-FP-NEXT:    vmul.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vmul.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmul_v4f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vmul.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s0
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vmul.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -102,25 +102,25 @@ define arm_aapcs_vfpcc half @fmul_v8f16(<8 x half> %x, half %y) {
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vrev32.16 q2, q0
 ; CHECK-FP-NEXT:    vmul.f16 q0, q0, q2
-; CHECK-FP-NEXT:    vmul.f16 s6, s2, s3
+; CHECK-FP-NEXT:    vmul.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmul.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmul.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vmul.f16 s0, s4, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmul_v8f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
 ; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s0, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s1
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s2
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s8
-; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s3
-; CHECK-NOFP-NEXT:    vmul.f16 s0, s6, s0
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s3
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s3
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vmul.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -134,37 +134,37 @@ define arm_aapcs_vfpcc half @fmul_v16f16(<16 x half> %x, half %y) {
 ; CHECK-FP-NEXT:    vmul.f16 q0, q0, q1
 ; CHECK-FP-NEXT:    vrev32.16 q1, q0
 ; CHECK-FP-NEXT:    vmul.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmul.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmul.f16 s2, s2, s3
 ; CHECK-FP-NEXT:    vmul.f16 s0, s0, s1
-; CHECK-FP-NEXT:    vmul.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-FP-NEXT:    vmul.f16 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmul_v16f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmul.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmul.f16 s12, s0, s4
-; CHECK-NOFP-NEXT:    vmul.f16 s10, s12, s10
-; CHECK-NOFP-NEXT:    vmul.f16 s12, s1, s5
-; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
-; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
-; CHECK-NOFP-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
-; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmul.f16 s12, s2, s6
-; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
-; CHECK-NOFP-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
-; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
-; CHECK-NOFP-NEXT:    vmul.f16 s12, s3, s7
-; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s10
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
+; CHECK-NOFP-NEXT:    vmovx.f16 s10, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s10, s4
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s2, s6
 ; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vmul.f16 s0, s10, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vmul.f16 s2, s2, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmul.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
+; CHECK-NOFP-NEXT:    vmul.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vmul.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -196,9 +196,9 @@ entry:
 define arm_aapcs_vfpcc double @fmul_v4f64(<4 x double> %x, double %y) {
 ; CHECK-LABEL: fmul_v4f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f64 d5, d1, d3
+; CHECK-NEXT:    vmul.f64 d1, d1, d3
 ; CHECK-NEXT:    vmul.f64 d0, d0, d2
-; CHECK-NEXT:    vmul.f64 d0, d0, d5
+; CHECK-NEXT:    vmul.f64 d0, d0, d1
 ; CHECK-NEXT:    vmul.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -209,8 +209,8 @@ entry:
 define arm_aapcs_vfpcc float @fmul_v2f32_nofast(<2 x float> %x, float %y) {
 ; CHECK-LABEL: fmul_v2f32_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f32 s4, s4, s0
-; CHECK-NEXT:    vmul.f32 s0, s4, s1
+; CHECK-NEXT:    vmul.f32 s0, s4, s0
+; CHECK-NEXT:    vmul.f32 s0, s0, s1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
@@ -220,10 +220,10 @@ entry:
 define arm_aapcs_vfpcc float @fmul_v4f32_nofast(<4 x float> %x, float %y) {
 ; CHECK-LABEL: fmul_v4f32_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f32 s4, s4, s0
-; CHECK-NEXT:    vmul.f32 s4, s4, s1
-; CHECK-NEXT:    vmul.f32 s4, s4, s2
-; CHECK-NEXT:    vmul.f32 s0, s4, s3
+; CHECK-NEXT:    vmul.f32 s0, s4, s0
+; CHECK-NEXT:    vmul.f32 s0, s0, s1
+; CHECK-NEXT:    vmul.f32 s0, s0, s2
+; CHECK-NEXT:    vmul.f32 s0, s0, s3
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
@@ -233,10 +233,10 @@ entry:
 define arm_aapcs_vfpcc float @fmul_v8f32_nofast(<8 x float> %x, float %y) {
 ; CHECK-LABEL: fmul_v8f32_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f32 s8, s8, s0
-; CHECK-NEXT:    vmul.f32 s8, s8, s1
-; CHECK-NEXT:    vmul.f32 s8, s8, s2
-; CHECK-NEXT:    vmul.f32 s0, s8, s3
+; CHECK-NEXT:    vmul.f32 s0, s8, s0
+; CHECK-NEXT:    vmul.f32 s0, s0, s1
+; CHECK-NEXT:    vmul.f32 s0, s0, s2
+; CHECK-NEXT:    vmul.f32 s0, s0, s3
 ; CHECK-NEXT:    vmul.f32 s0, s0, s4
 ; CHECK-NEXT:    vmul.f32 s0, s0, s5
 ; CHECK-NEXT:    vmul.f32 s0, s0, s6
@@ -250,9 +250,9 @@ entry:
 define arm_aapcs_vfpcc half @fmul_v2f16_nofast(<2 x half> %x, half %y) {
 ; CHECK-LABEL: fmul_v2f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f16 s4, s4, s0
+; CHECK-NEXT:    vmul.f16 s2, s4, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s0
-; CHECK-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-NEXT:    vmul.f16 s0, s2, s0
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
@@ -262,12 +262,12 @@ entry:
 define arm_aapcs_vfpcc half @fmul_v4f16_nofast(<4 x half> %x, half %y) {
 ; CHECK-LABEL: fmul_v4f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f16 s4, s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmul.f16 s4, s4, s1
-; CHECK-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-NEXT:    vmul.f16 s2, s4, s0
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmul.f16 s0, s2, s0
+; CHECK-NEXT:    vmovx.f16 s2, s1
+; CHECK-NEXT:    vmul.f16 s0, s0, s1
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
@@ -278,17 +278,17 @@ define arm_aapcs_vfpcc half @fmul_v8f16_nofast(<8 x half> %x, half %y) {
 ; CHECK-LABEL: fmul_v8f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.f16 s4, s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmul.f16 s4, s4, s1
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vmul.f16 s4, s4, s2
-; CHECK-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-NEXT:    vmul.f16 s4, s4, s3
+; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vmul.f16 s0, s0, s1
+; CHECK-NEXT:    vmul.f16 s0, s0, s4
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s3
+; CHECK-NEXT:    vmul.f16 s0, s0, s3
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
@@ -299,18 +299,18 @@ define arm_aapcs_vfpcc half @fmul_v16f16_nofast(<16 x half> %x, half %y) {
 ; CHECK-LABEL: fmul_v16f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.f16 s8, s8, s0
-; CHECK-NEXT:    vmovx.f16 s10, s0
-; CHECK-NEXT:    vmul.f16 s8, s8, s10
-; CHECK-NEXT:    vmovx.f16 s10, s1
-; CHECK-NEXT:    vmul.f16 s8, s8, s1
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmul.f16 s8, s8, s10
-; CHECK-NEXT:    vmovx.f16 s10, s2
-; CHECK-NEXT:    vmul.f16 s8, s8, s2
-; CHECK-NEXT:    vmovx.f16 s2, s4
-; CHECK-NEXT:    vmul.f16 s8, s8, s10
-; CHECK-NEXT:    vmul.f16 s8, s8, s3
+; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vmul.f16 s0, s8, s0
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmul.f16 s0, s0, s1
+; CHECK-NEXT:    vmul.f16 s0, s0, s8
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s3
+; CHECK-NEXT:    vmul.f16 s0, s0, s3
+; CHECK-NEXT:    vmul.f16 s0, s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s4
 ; CHECK-NEXT:    vmul.f16 s0, s0, s4
 ; CHECK-NEXT:    vmul.f16 s0, s0, s2
 ; CHECK-NEXT:    vmovx.f16 s2, s5
@@ -341,8 +341,8 @@ entry:
 define arm_aapcs_vfpcc double @fmul_v2f64_nofast(<2 x double> %x, double %y) {
 ; CHECK-LABEL: fmul_v2f64_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f64 d2, d2, d0
-; CHECK-NEXT:    vmul.f64 d0, d2, d1
+; CHECK-NEXT:    vmul.f64 d0, d2, d0
+; CHECK-NEXT:    vmul.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
   %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
@@ -352,8 +352,8 @@ entry:
 define arm_aapcs_vfpcc double @fmul_v4f64_nofast(<4 x double> %x, double %y) {
 ; CHECK-LABEL: fmul_v4f64_nofast:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f64 d4, d4, d0
-; CHECK-NEXT:    vmul.f64 d0, d4, d1
+; CHECK-NEXT:    vmul.f64 d0, d4, d0
+; CHECK-NEXT:    vmul.f64 d0, d0, d1
 ; CHECK-NEXT:    vmul.f64 d0, d0, d2
 ; CHECK-NEXT:    vmul.f64 d0, d0, d3
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 7bcc0193217d3..f5adcf0427649 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -503,10 +503,10 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vadd.f32 q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB5_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vadd.f32 s4, s2, s3
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    vadd.f32 s0, s0, s4
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-NEXT:    beq .LBB5_9
 ; CHECK-NEXT:  .LBB5_7: @ %for.body.preheader1
 ; CHECK-NEXT:    sub.w lr, r1, r2
@@ -601,10 +601,10 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vmul.f32 q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB6_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmul.f32 s4, s2, s3
+; CHECK-NEXT:    vmul.f32 s2, s2, s3
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    vmul.f32 s0, s0, s1
-; CHECK-NEXT:    vmul.f32 s0, s0, s4
+; CHECK-NEXT:    vmul.f32 s0, s0, s2
 ; CHECK-NEXT:    beq .LBB6_9
 ; CHECK-NEXT:  .LBB6_7: @ %for.body.preheader1
 ; CHECK-NEXT:    sub.w lr, r1, r2
@@ -1464,9 +1464,9 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB15_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-NEXT:    vminnm.f32 s2, s2, s3
 ; CHECK-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-NEXT:    vminnm.f32 s0, s0, s2
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    beq .LBB15_9
 ; CHECK-NEXT:  .LBB15_7: @ %for.body.preheader1
@@ -1567,9 +1567,9 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB16_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-NEXT:    vmaxnm.f32 s2, s2, s3
 ; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    beq .LBB16_9
 ; CHECK-NEXT:  .LBB16_7: @ %for.body.preheader1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
index d06a5418c70da..bf966ee17b7e6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
@@ -54,17 +54,17 @@ define arm_aapcs_vfpcc <8 x i16> @vrhadd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vrhadd_s32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s12, s2
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s16, s6
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    .vsave {d9}
+; CHECK-NEXT:    vpush {d9}
+; CHECK-NEXT:    vmov.f32 s8, s2
 ; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov.f32 s10, s3
+; CHECK-NEXT:    vmov.f32 s14, s1
+; CHECK-NEXT:    vmov.f32 s18, s5
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
@@ -77,27 +77,26 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-NEXT:    adc.w r3, r2, r3, asr #31
 ; CHECK-NEXT:    adds r2, r1, #1
 ; CHECK-NEXT:    adc r1, r3, #0
-; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r3, s18
 ; CHECK-NEXT:    lsrl r2, r1, #1
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    adds r0, #1
 ; CHECK-NEXT:    adc r1, r1, #0
 ; CHECK-NEXT:    lsrl r0, r1, #1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    asrs r2, r1, #31
 ; CHECK-NEXT:    adds r1, r1, r3
 ; CHECK-NEXT:    adc.w r3, r2, r3, asr #31
 ; CHECK-NEXT:    adds r2, r1, #1
 ; CHECK-NEXT:    adc r1, r3, #0
 ; CHECK-NEXT:    lsrl r2, r1, #1
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
+; CHECK-NEXT:    vpop {d9}
 ; CHECK-NEXT:    bx lr
   %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -153,17 +152,17 @@ define arm_aapcs_vfpcc <8 x i16> @vhadd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vhadd_s32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s12, s2
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s16, s6
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    .vsave {d9}
+; CHECK-NEXT:    vpush {d9}
+; CHECK-NEXT:    vmov.f32 s8, s2
 ; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov.f32 s10, s3
+; CHECK-NEXT:    vmov.f32 s14, s1
+; CHECK-NEXT:    vmov.f32 s18, s5
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov r2, s8
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
@@ -173,22 +172,21 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-NEXT:    asr.w r12, r1, #31
 ; CHECK-NEXT:    adc.w r1, r12, r3, asr #31
 ; CHECK-NEXT:    lsrl r2, r1, #1
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov r2, s18
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    asrs r1, r0, #31
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
 ; CHECK-NEXT:    lsrl r0, r1, #1
-; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r1, s14
 ; CHECK-NEXT:    adds r2, r1, r3
 ; CHECK-NEXT:    asr.w r12, r1, #31
 ; CHECK-NEXT:    adc.w r1, r12, r3, asr #31
 ; CHECK-NEXT:    lsrl r2, r1, #1
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
+; CHECK-NEXT:    vpop {d9}
 ; CHECK-NEXT:    bx lr
   %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
   %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -255,10 +253,10 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_u32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s12, s2
 ; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s12, s2
 ; CHECK-NEXT:    vand q2, q2, q4
+; CHECK-NEXT:    vmov.f32 s14, s3
 ; CHECK-NEXT:    vand q3, q3, q4
 ; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vmov r2, r3, d6
@@ -356,10 +354,10 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_u32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s12, s2
 ; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s12, s2
 ; CHECK-NEXT:    vand q2, q2, q4
+; CHECK-NEXT:    vmov.f32 s14, s3
 ; CHECK-NEXT:    vand q3, q3, q4
 ; CHECK-NEXT:    vmov r0, r1, d4
 ; CHECK-NEXT:    vmov r2, r3, d6
@@ -498,23 +496,23 @@ define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .vsave {d9}
+; CHECK-NEXT:    vpush {d9}
 ; CHECK-NEXT:    mov.w lr, #256
 ; CHECK-NEXT:  .LBB14_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vmov.f32 s12, s6
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.f32 s14, s7
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r5, s12
-; CHECK-NEXT:    vmov.f32 s2, s1
-; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov.f32 s18, s5
+; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    asrs r4, r3, #31
 ; CHECK-NEXT:    adds.w r12, r3, r5
-; CHECK-NEXT:    asr.w r4, r3, #31
 ; CHECK-NEXT:    adc.w r3, r4, r5, asr #31
 ; CHECK-NEXT:    vmov r5, s4
 ; CHECK-NEXT:    lsrl r12, r3, #1
@@ -523,24 +521,24 @@ define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly
 ; CHECK-NEXT:    asr.w r4, r3, #31
 ; CHECK-NEXT:    adc.w r3, r4, r5, asr #31
 ; CHECK-NEXT:    lsrl r6, r3, #1
-; CHECK-NEXT:    vmov r5, s14
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vmov q4[2], q4[0], r6, r12
+; CHECK-NEXT:    vmov r5, s6
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov q3[2], q3[0], r6, r12
 ; CHECK-NEXT:    adds r4, r3, r5
 ; CHECK-NEXT:    asr.w r6, r3, #31
 ; CHECK-NEXT:    adc.w r3, r6, r5, asr #31
 ; CHECK-NEXT:    lsrl r4, r3, #1
-; CHECK-NEXT:    vmov r5, s6
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    vmov r3, s10
 ; CHECK-NEXT:    adds r6, r3, r5
 ; CHECK-NEXT:    asr.w r12, r3, #31
 ; CHECK-NEXT:    adc.w r3, r12, r5, asr #31
 ; CHECK-NEXT:    lsrl r6, r3, #1
-; CHECK-NEXT:    vmov q4[3], q4[1], r6, r4
-; CHECK-NEXT:    vstrb.8 q4, [r2], #16
+; CHECK-NEXT:    vmov q3[3], q3[1], r6, r4
+; CHECK-NEXT:    vstrb.8 q3, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB14_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d9}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   br label %vector.body
@@ -677,10 +675,10 @@ define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly
 ; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q4, [r1], #16
 ; CHECK-NEXT:    vmov.f32 s4, s14
-; CHECK-NEXT:    vmov.f32 s8, s18
 ; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vmov.f32 s10, s19
+; CHECK-NEXT:    vmov.f32 s8, s18
 ; CHECK-NEXT:    vand q1, q1, q0
+; CHECK-NEXT:    vmov.f32 s10, s19
 ; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:    vmov r3, r5, d2
 ; CHECK-NEXT:    vmov r4, r6, d4
@@ -859,10 +857,10 @@ define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly
 ; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
 ; CHECK-NEXT:    vmov.f32 s4, s14
-; CHECK-NEXT:    vmov.f32 s8, s18
 ; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vmov.f32 s10, s19
+; CHECK-NEXT:    vmov.f32 s8, s18
 ; CHECK-NEXT:    vand q1, q1, q0
+; CHECK-NEXT:    vmov.f32 s10, s19
 ; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:    vmov r3, r12, d2
 ; CHECK-NEXT:    vmov r4, r5, d4
@@ -1049,10 +1047,10 @@ define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly
 ; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
 ; CHECK-NEXT:    vmov.f32 s4, s14
-; CHECK-NEXT:    vmov.f32 s8, s18
 ; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vmov.f32 s10, s19
+; CHECK-NEXT:    vmov.f32 s8, s18
 ; CHECK-NEXT:    vand q1, q1, q0
+; CHECK-NEXT:    vmov.f32 s10, s19
 ; CHECK-NEXT:    vand q2, q2, q0
 ; CHECK-NEXT:    vmov r3, r12, d2
 ; CHECK-NEXT:    vmov r4, r5, d4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
index 6bd3ee578b89c..693afc151c796 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
@@ -70,15 +70,13 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
-; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.f32 s8, s2
 ; CHECK-NEXT:    vmov.f32 s9, s3
-; CHECK-NEXT:    vmov.f32 s10, s6
 ; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s11, s7
 ; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov r4, r7, d4
+; CHECK-NEXT:    vmov lr, r12, d3
 ; CHECK-NEXT:    vmov r2, r5, d0
-; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov r4, r7, d4
 ; CHECK-NEXT:    vmov r3, r6, d1
 ; CHECK-NEXT:    adds.w r3, r3, lr
 ; CHECK-NEXT:    adc.w r6, r6, r12

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index 93967f052b0aa..f9c4965d05ca1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -77,12 +77,10 @@ define void @vld2_v16i32(<32 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK-NEXT:    vld20.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    vld20.32 {q1, q2}, [r3]
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4
-; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vadd.i32 q3, q3, q4
+; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vld21.32 {q1, q2}, [r3]
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2
 ; CHECK-NEXT:    vadd.i32 q5, q5, q6
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vadd.i32 q1, q1, q2
@@ -102,14 +100,14 @@ entry:
 define void @vld2_v4i32_align1(<8 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vld2_v4i32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q1, [r0]
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
 ; CHECK-NEXT:    vmov.f32 s8, s5
 ; CHECK-NEXT:    vmov.f32 s9, s7
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s7, s2
 ; CHECK-NEXT:    vadd.i32 q0, q1, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -207,25 +205,25 @@ define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u8 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
 ; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s4, s6
 ; CHECK-NEXT:    vmovx.f16 s5, s2
-; CHECK-NEXT:    vins.f16 s0, s1
-; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
-; CHECK-NEXT:    vins.f16 s2, s3
+; CHECK-NEXT:    vmovx.f16 s6, s3
 ; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vins.f16 s5, s6
 ; CHECK-NEXT:    vmovx.f16 s6, s8
 ; CHECK-NEXT:    vins.f16 s6, s12
-; CHECK-NEXT:    vmovx.f16 s12, s11
 ; CHECK-NEXT:    vmovx.f16 s7, s10
-; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vins.f16 s2, s3
+; CHECK-NEXT:    vins.f16 s10, s11
 ; CHECK-NEXT:    vins.f16 s8, s9
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vins.f16 s7, s12
 ; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vins.f16 s10, s11
 ; CHECK-NEXT:    vmov.f32 s3, s10
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -324,15 +322,13 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.f32 s8, s2
 ; CHECK-NEXT:    vmov.f32 s9, s3
-; CHECK-NEXT:    vmov.f32 s10, s6
 ; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s11, s7
 ; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov r0, r4, d4
+; CHECK-NEXT:    vmov lr, r12, d3
 ; CHECK-NEXT:    vmov r5, r6, d0
-; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov r0, r4, d4
 ; CHECK-NEXT:    vmov r3, r2, d1
 ; CHECK-NEXT:    adds.w r3, r3, lr
 ; CHECK-NEXT:    adc.w r2, r2, r12
@@ -356,34 +352,30 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov lr, r12, d5
 ; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    vmov.f32 s3, s9
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vmov.f32 s16, s10
 ; CHECK-NEXT:    vmov.f32 s17, s11
-; CHECK-NEXT:    vmov.f32 s18, s14
-; CHECK-NEXT:    vmov.f32 s10, s12
-; CHECK-NEXT:    vmov lr, r12, d3
+; CHECK-NEXT:    vmov r5, r6, d4
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    vmov.f32 s19, s15
-; CHECK-NEXT:    vmov.f32 s11, s13
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s13
 ; CHECK-NEXT:    vmov r0, r7, d8
-; CHECK-NEXT:    vmov r5, r6, d4
 ; CHECK-NEXT:    adds.w lr, lr, r2
 ; CHECK-NEXT:    adc.w r12, r12, r3
-; CHECK-NEXT:    vmov r3, r4, d9
+; CHECK-NEXT:    vmov r3, r4, d7
 ; CHECK-NEXT:    adds r0, r0, r5
 ; CHECK-NEXT:    adc.w r8, r6, r7
-; CHECK-NEXT:    vmov r6, r5, d5
+; CHECK-NEXT:    vmov r6, r5, d1
 ; CHECK-NEXT:    vmov r2, r7, d0
 ; CHECK-NEXT:    adds r3, r3, r6
 ; CHECK-NEXT:    adc.w r6, r5, r4
@@ -396,7 +388,7 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-NEXT:    adc.w r0, r7, r4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <8 x i64>, <8 x i64>* %src, align 8
@@ -480,12 +472,10 @@ define void @vld2_v16f32(<32 x float> *%src, <16 x float> *%dst) {
 ; CHECK-NEXT:    vld20.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-NEXT:    vld20.32 {q1, q2}, [r3]
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4
-; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vadd.f32 q3, q3, q4
+; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vld21.32 {q1, q2}, [r3]
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2
 ; CHECK-NEXT:    vadd.f32 q5, q5, q6
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vadd.f32 q1, q1, q2
@@ -505,14 +495,14 @@ entry:
 define void @vld2_v4f32_align1(<8 x float> *%src, <4 x float> *%dst) {
 ; CHECK-LABEL: vld2_v4f32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q1, [r0]
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
 ; CHECK-NEXT:    vmov.f32 s8, s5
 ; CHECK-NEXT:    vmov.f32 s9, s7
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s7, s2
 ; CHECK-NEXT:    vadd.f32 q0, q1, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -535,11 +525,11 @@ define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    ldr r0, [r0, #4]
 ; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmovx.f16 s2, s1
+; CHECK-NEXT:    vins.f16 s4, s2
 ; CHECK-NEXT:    vins.f16 s0, s1
-; CHECK-NEXT:    vadd.f16 q0, q0, q2
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    str r0, [r1]
 ; CHECK-NEXT:    bx lr
@@ -556,14 +546,14 @@ define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld2_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s1
 ; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s4, s6
 ; CHECK-NEXT:    vmovx.f16 s5, s2
-; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vmovx.f16 s6, s3
 ; CHECK-NEXT:    vins.f16 s2, s3
-; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s5, s6
 ; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, r2, d0
@@ -620,25 +610,25 @@ define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld2_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u8 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
 ; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s4, s6
 ; CHECK-NEXT:    vmovx.f16 s5, s2
-; CHECK-NEXT:    vins.f16 s0, s1
-; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
-; CHECK-NEXT:    vins.f16 s2, s3
+; CHECK-NEXT:    vmovx.f16 s6, s3
 ; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vins.f16 s5, s6
 ; CHECK-NEXT:    vmovx.f16 s6, s8
 ; CHECK-NEXT:    vins.f16 s6, s12
-; CHECK-NEXT:    vmovx.f16 s12, s11
 ; CHECK-NEXT:    vmovx.f16 s7, s10
+; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vins.f16 s2, s3
+; CHECK-NEXT:    vins.f16 s10, s11
 ; CHECK-NEXT:    vins.f16 s8, s9
+; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vins.f16 s10, s11
-; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vins.f16 s7, s12
+; CHECK-NEXT:    vmov.f32 s2, s8
 ; CHECK-NEXT:    vmov.f32 s3, s10
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index b998d62b0d9c6..bf76ba3a513ca 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -10,7 +10,6 @@ define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d2, d0
 ; CHECK-NEXT:    vmov.f32 s6, s3
 ; CHECK-NEXT:    vmov r12, lr, d0
 ; CHECK-NEXT:    vmov r3, s6
@@ -37,20 +36,20 @@ define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
 ; CHECK-NEXT:    vmov.f32 s15, s18
 ; CHECK-NEXT:    vmov.f32 s11, s17
 ; CHECK-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
 ; CHECK-NEXT:    vmov.f32 s3, s19
 ; CHECK-NEXT:    vadd.i32 q0, q2, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -72,37 +71,37 @@ define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
 ; CHECK-NEXT:    vmov.f32 s15, s18
 ; CHECK-NEXT:    vmov.f32 s11, s17
 ; CHECK-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-NEXT:    vadd.i32 q0, q2, q0
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f64 d10, d4
 ; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s16, s9
 ; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmov.f32 s21, s11
 ; CHECK-NEXT:    vmov.f32 s23, s13
 ; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vmov.f32 s6, s12
 ; CHECK-NEXT:    vmov.f32 s7, s15
 ; CHECK-NEXT:    vadd.i32 q1, q4, q1
 ; CHECK-NEXT:    vstrw.32 q1, [r1]
@@ -124,71 +123,71 @@ define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
 ; CHECK-NEXT:    vmov.f32 s15, s18
 ; CHECK-NEXT:    vmov.f32 s11, s17
 ; CHECK-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-NEXT:    vadd.i32 q0, q2, q0
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f64 d10, d4
 ; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.f32 s21, s11
 ; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s23, s13
 ; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
 ; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vmov.f32 s23, s13
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
 ; CHECK-NEXT:    vmov.f32 s7, s15
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
-; CHECK-NEXT:    vadd.i32 q4, q5, q4
-; CHECK-NEXT:    vmov.f32 s20, s13
 ; CHECK-NEXT:    vadd.i32 q1, q4, q1
-; CHECK-NEXT:    vmov.f64 d8, d6
-; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.f32 s21, s8
 ; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f32 s21, s8
 ; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vmov.f32 s8, s14
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-NEXT:    vmov.f32 s10, s24
+; CHECK-NEXT:    vmov.f32 s16, s12
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s20, s13
 ; CHECK-NEXT:    vmov.f32 s23, s26
 ; CHECK-NEXT:    vmov.f32 s19, s25
 ; CHECK-NEXT:    vadd.i32 q4, q4, q5
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vmov.f32 s10, s24
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
 ; CHECK-NEXT:    vmov.f32 s11, s27
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
 ; CHECK-NEXT:    vadd.i32 q2, q4, q2
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s24, s17
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f64 d14, d8
-; CHECK-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-NEXT:    vmov.f32 s25, s12
-; CHECK-NEXT:    vmov.f32 s29, s19
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vmov.f32 s26, s15
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s30, s14
-; CHECK-NEXT:    vmov.f32 s12, s18
-; CHECK-NEXT:    vmov.f32 s14, s20
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s24, s17
 ; CHECK-NEXT:    vmov.f32 s27, s22
+; CHECK-NEXT:    vmov.f32 s28, s16
+; CHECK-NEXT:    vmov.f32 s29, s19
 ; CHECK-NEXT:    vmov.f32 s31, s21
 ; CHECK-NEXT:    vadd.i32 q6, q7, q6
+; CHECK-NEXT:    vmov.f32 s12, s18
+; CHECK-NEXT:    vmov.f32 s14, s20
 ; CHECK-NEXT:    vmov.f32 s15, s23
 ; CHECK-NEXT:    vadd.i32 q3, q6, q3
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
@@ -216,23 +215,22 @@ define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) {
 ; CHECK-NEXT:    ldr r2, [r0, #8]
 ; CHECK-NEXT:    mov r3, sp
 ; CHECK-NEXT:    str r2, [sp]
-; CHECK-NEXT:    vmov.f64 d2, d0
 ; CHECK-NEXT:    vmov.f32 s6, s3
 ; CHECK-NEXT:    vmov.f32 s8, s1
-; CHECK-NEXT:    vmov.f64 d6, d1
 ; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vldrh.u32 q1, [r3]
-; CHECK-NEXT:    vmov.f32 s10, s4
-; CHECK-NEXT:    vmov.f32 s14, s5
-; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.f32 s6, s4
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    add r0, r2
-; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    strh r0, [r1, #2]
 ; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    add r0, r2
-; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    strh r0, [r1]
 ; CHECK-NEXT:    add sp, #8
@@ -292,49 +290,49 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vmov.f32 s0, s5
-; CHECK-NEXT:    vins.f16 s0, s8
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vins.f16 s1, s12
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vmovx.f16 s2, s6
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmovx.f16 s5, s5
+; CHECK-NEXT:    vins.f16 s1, s2
 ; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmovx.f16 s2, s15
 ; CHECK-NEXT:    vmov.f32 s18, s12
+; CHECK-NEXT:    vins.f16 s19, s2
+; CHECK-NEXT:    vmov.f32 s2, s11
 ; CHECK-NEXT:    vmov q5, q4
-; CHECK-NEXT:    vmovnb.i32 q5, q0
-; CHECK-NEXT:    vmov.f32 s2, s22
-; CHECK-NEXT:    vmovx.f16 s20, s5
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov.f64 d8, d2
-; CHECK-NEXT:    vins.f16 s16, s20
-; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vmov.f32 s16, s4
+; CHECK-NEXT:    vins.f16 s16, s5
+; CHECK-NEXT:    vmovx.f16 s5, s8
 ; CHECK-NEXT:    vmov.f32 s17, s7
-; CHECK-NEXT:    vins.f16 s17, s20
-; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vins.f16 s17, s5
+; CHECK-NEXT:    vmovx.f16 s5, s11
 ; CHECK-NEXT:    vmov.f32 s18, s10
-; CHECK-NEXT:    vins.f16 s18, s20
-; CHECK-NEXT:    vmovx.f16 s20, s14
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmovx.f16 s11, s13
+; CHECK-NEXT:    vins.f16 s18, s5
+; CHECK-NEXT:    vmovx.f16 s5, s7
+; CHECK-NEXT:    vmovnb.i32 q5, q0
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vmovx.f16 s14, s14
 ; CHECK-NEXT:    vmov.f32 s19, s13
-; CHECK-NEXT:    vins.f16 s19, s20
-; CHECK-NEXT:    vmovx.f16 s20, s4
-; CHECK-NEXT:    vins.f16 s20, s6
-; CHECK-NEXT:    vmovx.f16 s21, s7
-; CHECK-NEXT:    vins.f16 s6, s12
-; CHECK-NEXT:    vmovx.f16 s7, s13
-; CHECK-NEXT:    vins.f16 s21, s9
-; CHECK-NEXT:    vins.f16 s7, s15
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmovnb.i32 q2, q5
-; CHECK-NEXT:    vmov.f32 s22, s10
-; CHECK-NEXT:    vmov.f32 s23, s7
-; CHECK-NEXT:    vadd.i16 q1, q4, q5
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s5, s9
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vins.f16 s11, s15
+; CHECK-NEXT:    vins.f16 s19, s14
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmovnb.i32 q3, q1
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov.f32 s6, s14
+; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vadd.i16 q1, q4, q1
 ; CHECK-NEXT:    vadd.i16 q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
@@ -355,103 +353,98 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vmov.f64 d0, d2
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vins.f16 s0, s8
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s8
 ; CHECK-NEXT:    vmov.f32 s1, s7
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vmovx.f16 s16, s9
-; CHECK-NEXT:    vins.f16 s1, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vins.f16 s1, s2
 ; CHECK-NEXT:    vmov.f32 s2, s10
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmovx.f16 s14, s18
+; CHECK-NEXT:    vmov.f32 s3, s17
 ; CHECK-NEXT:    vins.f16 s2, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vins.f16 s3, s14
+; CHECK-NEXT:    vmovx.f16 s14, s19
+; CHECK-NEXT:    vins.f16 s18, s14
 ; CHECK-NEXT:    vins.f16 s5, s12
+; CHECK-NEXT:    vmovx.f16 s12, s9
 ; CHECK-NEXT:    vmov.f32 s13, s8
-; CHECK-NEXT:    vins.f16 s13, s16
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vins.f16 s13, s12
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovx.f16 s20, s18
-; CHECK-NEXT:    vmov.f32 s3, s17
-; CHECK-NEXT:    vins.f16 s3, s20
-; CHECK-NEXT:    vmovx.f16 s20, s19
-; CHECK-NEXT:    vins.f16 s18, s20
+; CHECK-NEXT:    vmovx.f16 s5, s7
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
 ; CHECK-NEXT:    vmov.f32 s14, s11
+; CHECK-NEXT:    vmovx.f16 s11, s17
 ; CHECK-NEXT:    vmov.f32 s23, s18
 ; CHECK-NEXT:    vmov.f32 s22, s16
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s5, s9
 ; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vins.f16 s10, s16
+; CHECK-NEXT:    vins.f16 s11, s19
 ; CHECK-NEXT:    vmovnb.i32 q6, q3
-; CHECK-NEXT:    vmov.f32 s14, s26
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vmovx.f16 s20, s4
-; CHECK-NEXT:    vins.f16 s20, s6
-; CHECK-NEXT:    vmovx.f16 s21, s7
-; CHECK-NEXT:    vins.f16 s6, s16
-; CHECK-NEXT:    vmovx.f16 s7, s17
-; CHECK-NEXT:    vins.f16 s21, s9
-; CHECK-NEXT:    vins.f16 s7, s19
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmovnb.i32 q2, q5
-; CHECK-NEXT:    vmov.f32 s22, s10
+; CHECK-NEXT:    vmov.f32 s15, s18
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmovnb.i32 q4, q1
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov.f32 s6, s18
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov.f32 s23, s7
-; CHECK-NEXT:    vadd.i16 q0, q0, q5
-; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s14, s26
+; CHECK-NEXT:    vmovx.f16 s6, s10
 ; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vins.f16 s4, s12
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmovx.f16 s16, s13
-; CHECK-NEXT:    vmov.f32 s5, s12
-; CHECK-NEXT:    vins.f16 s5, s16
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmovx.f16 s20, s19
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vmovx.f16 s7, s19
 ; CHECK-NEXT:    vmov.f32 s27, s18
-; CHECK-NEXT:    vins.f16 s27, s20
-; CHECK-NEXT:    vmov.f64 d10, d4
-; CHECK-NEXT:    vins.f16 s20, s0
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s6, s13
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vins.f16 s27, s7
 ; CHECK-NEXT:    vmov.f32 s26, s16
-; CHECK-NEXT:    vmovx.f16 s0, s12
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vins.f16 s21, s0
+; CHECK-NEXT:    vins.f16 s5, s6
+; CHECK-NEXT:    vmov.f32 s6, s15
 ; CHECK-NEXT:    vmov q7, q6
+; CHECK-NEXT:    vmov.f32 s20, s8
 ; CHECK-NEXT:    vmovnb.i32 q7, q1
-; CHECK-NEXT:    vmovx.f16 s0, s15
+; CHECK-NEXT:    vmovx.f16 s6, s9
+; CHECK-NEXT:    vins.f16 s20, s6
+; CHECK-NEXT:    vmovx.f16 s6, s12
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmovx.f16 s8, s8
+; CHECK-NEXT:    vmovx.f16 s9, s11
+; CHECK-NEXT:    vins.f16 s21, s6
+; CHECK-NEXT:    vmovx.f16 s6, s15
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmovx.f16 s15, s17
 ; CHECK-NEXT:    vmov.f32 s22, s14
-; CHECK-NEXT:    vins.f16 s22, s0
+; CHECK-NEXT:    vins.f16 s8, s10
+; CHECK-NEXT:    vins.f16 s9, s13
+; CHECK-NEXT:    vins.f16 s14, s16
+; CHECK-NEXT:    vins.f16 s15, s19
+; CHECK-NEXT:    vins.f16 s22, s6
+; CHECK-NEXT:    vmovx.f16 s6, s18
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vins.f16 s23, s6
+; CHECK-NEXT:    vmovnb.i32 q4, q2
+; CHECK-NEXT:    vmov.f32 s11, s15
+; CHECK-NEXT:    vmov.f32 s10, s18
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s6, s30
+; CHECK-NEXT:    vadd.i16 q2, q5, q2
 ; CHECK-NEXT:    vmov.f32 s7, s27
-; CHECK-NEXT:    vmovx.f16 s24, s8
-; CHECK-NEXT:    vmovx.f16 s0, s18
-; CHECK-NEXT:    vmov.f32 s23, s17
-; CHECK-NEXT:    vins.f16 s24, s10
-; CHECK-NEXT:    vins.f16 s23, s0
-; CHECK-NEXT:    vins.f16 s2, s16
-; CHECK-NEXT:    vmovx.f16 s25, s11
-; CHECK-NEXT:    vmovx.f16 s3, s17
-; CHECK-NEXT:    vins.f16 s25, s13
-; CHECK-NEXT:    vins.f16 s3, s19
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmovnb.i32 q2, q6
-; CHECK-NEXT:    vmov.f32 s26, s10
-; CHECK-NEXT:    vmov.f32 s27, s3
-; CHECK-NEXT:    vadd.i16 q0, q5, q6
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vadd.i16 q1, q2, q1
+; CHECK-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -558,22 +551,21 @@ define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
 ; CHECK-NEXT:    vmov.16 q2[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
 ; CHECK-NEXT:    vmov.16 q3[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vins.f16 s10, s4
+; CHECK-NEXT:    vmov.16 q2[4], r2
 ; CHECK-NEXT:    vmov.16 q3[4], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[15]
 ; CHECK-NEXT:    vmovx.f16 s16, s6
@@ -581,6 +573,7 @@ define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-NEXT:    vmovx.f16 s11, s5
 ; CHECK-NEXT:    vmov.16 q3[5], r0
 ; CHECK-NEXT:    vins.f16 s18, s16
+; CHECK-NEXT:    vins.f16 s10, s4
 ; CHECK-NEXT:    vins.f16 s11, s7
 ; CHECK-NEXT:    vmov.f32 s15, s18
 ; CHECK-NEXT:    vmov.u8 r0, q0[2]
@@ -653,12 +646,11 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.8 q4[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[14]
 ; CHECK-NEXT:    vmov.8 q4[15], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
 ; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.f32 s15, s19
 ; CHECK-NEXT:    vmov.8 q5[11], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.f32 s14, s22
-; CHECK-NEXT:    vmov.f32 s15, s19
 ; CHECK-NEXT:    vmov.8 q4[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.8 q4[1], r0
@@ -681,19 +673,20 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.u8 r0, q0[14]
 ; CHECK-NEXT:    vmov.8 q4[10], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.f32 s14, s22
 ; CHECK-NEXT:    vmov.8 q5[12], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov q6, q4
 ; CHECK-NEXT:    vmov.8 q5[13], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[10]
 ; CHECK-NEXT:    vmov.8 q5[14], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[13]
 ; CHECK-NEXT:    vmov.8 q5[15], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[1]
-; CHECK-NEXT:    vmov q6, q4
 ; CHECK-NEXT:    vmov.8 q6[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.f32 s18, s26
 ; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
 ; CHECK-NEXT:    vadd.i8 q3, q4, q3
 ; CHECK-NEXT:    vmov.8 q4[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[5]
@@ -753,19 +746,15 @@ define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d6, d1
+; CHECK-NEXT:    vmov.f32 s12, s2
 ; CHECK-NEXT:    vmov.f32 s13, s3
-; CHECK-NEXT:    vmov.f32 s14, s4
-; CHECK-NEXT:    vmov.f32 s2, s10
-; CHECK-NEXT:    vmov.f32 s3, s11
-; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vmov.f32 s11, s7
-; CHECK-NEXT:    vmov r5, r8, d6
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov r0, r3, d5
+; CHECK-NEXT:    vmov r2, r4, d3
 ; CHECK-NEXT:    vmov r6, r7, d0
-; CHECK-NEXT:    vmov r0, r3, d1
-; CHECK-NEXT:    vmov lr, r12, d7
-; CHECK-NEXT:    vmov r2, r4, d5
+; CHECK-NEXT:    vmov r5, r8, d6
+; CHECK-NEXT:    vmov lr, r12, d1
 ; CHECK-NEXT:    adds.w r0, r0, lr
 ; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    adds r0, r0, r2
@@ -795,50 +784,42 @@ define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #80]
-; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
 ; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s2, s10
-; CHECK-NEXT:    vmov.f32 s3, s11
-; CHECK-NEXT:    vmov.f32 s10, s14
-; CHECK-NEXT:    vmov.f32 s7, s13
-; CHECK-NEXT:    vmov.f32 s11, s15
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT:    vmov.f64 d10, d7
-; CHECK-NEXT:    vmov lr, r12, d3
-; CHECK-NEXT:    vmov r5, r4, d1
-; CHECK-NEXT:    vmov r3, r8, d5
-; CHECK-NEXT:    vmov.f32 s21, s15
-; CHECK-NEXT:    vmov.f32 s22, s24
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmov.f32 s23, s25
-; CHECK-NEXT:    vmov.f32 s15, s19
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vmov r6, r7, d10
-; CHECK-NEXT:    vmov.f32 s19, s27
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s13
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    vmov r3, r8, d7
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s24, s22
+; CHECK-NEXT:    vmov.f32 s25, s23
+; CHECK-NEXT:    vmov lr, r12, d1
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s13
+; CHECK-NEXT:    vmov r6, r7, d12
 ; CHECK-NEXT:    adds.w r0, r5, lr
 ; CHECK-NEXT:    adc.w r5, r4, r12
 ; CHECK-NEXT:    adds.w lr, r0, r3
-; CHECK-NEXT:    vmov r4, r2, d6
+; CHECK-NEXT:    vmov r4, r2, d10
 ; CHECK-NEXT:    adc.w r12, r5, r8
 ; CHECK-NEXT:    vmov r5, r0, d8
 ; CHECK-NEXT:    adds r6, r6, r4
 ; CHECK-NEXT:    adcs r2, r7
 ; CHECK-NEXT:    adds r6, r6, r5
 ; CHECK-NEXT:    adc.w r8, r2, r0
-; CHECK-NEXT:    vmov r7, r4, d11
-; CHECK-NEXT:    vmov r2, r5, d7
+; CHECK-NEXT:    vmov r7, r4, d1
+; CHECK-NEXT:    vmov r2, r5, d9
 ; CHECK-NEXT:    vmov r3, r0, d0
 ; CHECK-NEXT:    adds r2, r2, r7
 ; CHECK-NEXT:    adc.w r7, r5, r4
-; CHECK-NEXT:    vmov r5, r4, d9
+; CHECK-NEXT:    vmov r5, r4, d7
 ; CHECK-NEXT:    adds r2, r2, r5
 ; CHECK-NEXT:    adcs r7, r4
 ; CHECK-NEXT:    vmov r5, r4, d2
@@ -853,7 +834,7 @@ define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-NEXT:    adcs r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <12 x i64>, <12 x i64>* %src, align 4
@@ -874,7 +855,7 @@ define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) {
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldr s1, [r0, #16]
 ; CHECK-NEXT:    vldr s5, [r0, #20]
-; CHECK-NEXT:    vmov.f64 d6, d4
+; CHECK-NEXT:    vmov.f32 s12, s8
 ; CHECK-NEXT:    vmov.f32 s13, s11
 ; CHECK-NEXT:    vmov.f32 s0, s9
 ; CHECK-NEXT:    vadd.f32 q0, q3, q0
@@ -898,20 +879,20 @@ define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
 ; CHECK-NEXT:    vmov.f32 s15, s18
 ; CHECK-NEXT:    vmov.f32 s11, s17
 ; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
 ; CHECK-NEXT:    vmov.f32 s3, s19
 ; CHECK-NEXT:    vadd.f32 q0, q2, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -933,37 +914,37 @@ define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
 ; CHECK-NEXT:    vmov.f32 s15, s18
 ; CHECK-NEXT:    vmov.f32 s11, s17
 ; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-NEXT:    vadd.f32 q0, q2, q0
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f64 d10, d4
 ; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s16, s9
 ; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmov.f32 s21, s11
 ; CHECK-NEXT:    vmov.f32 s23, s13
 ; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vmov.f32 s6, s12
 ; CHECK-NEXT:    vmov.f32 s7, s15
 ; CHECK-NEXT:    vadd.f32 q1, q4, q1
 ; CHECK-NEXT:    vstrw.32 q1, [r1]
@@ -985,71 +966,71 @@ define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
 ; CHECK-NEXT:    vmov.f32 s15, s18
 ; CHECK-NEXT:    vmov.f32 s11, s17
 ; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-NEXT:    vadd.f32 q0, q2, q0
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f64 d10, d4
 ; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.f32 s21, s11
 ; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s23, s13
 ; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
 ; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vmov.f32 s23, s13
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
 ; CHECK-NEXT:    vmov.f32 s7, s15
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
-; CHECK-NEXT:    vadd.f32 q4, q5, q4
-; CHECK-NEXT:    vmov.f32 s20, s13
 ; CHECK-NEXT:    vadd.f32 q1, q4, q1
-; CHECK-NEXT:    vmov.f64 d8, d6
-; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.f32 s21, s8
 ; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f32 s21, s8
 ; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vmov.f32 s8, s14
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-NEXT:    vmov.f32 s10, s24
+; CHECK-NEXT:    vmov.f32 s16, s12
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s20, s13
 ; CHECK-NEXT:    vmov.f32 s23, s26
 ; CHECK-NEXT:    vmov.f32 s19, s25
 ; CHECK-NEXT:    vadd.f32 q4, q4, q5
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vmov.f32 s10, s24
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
 ; CHECK-NEXT:    vmov.f32 s11, s27
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
 ; CHECK-NEXT:    vadd.f32 q2, q4, q2
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s24, s17
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f64 d14, d8
-; CHECK-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-NEXT:    vmov.f32 s25, s12
-; CHECK-NEXT:    vmov.f32 s29, s19
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vmov.f32 s26, s15
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s30, s14
-; CHECK-NEXT:    vmov.f32 s12, s18
-; CHECK-NEXT:    vmov.f32 s14, s20
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s24, s17
 ; CHECK-NEXT:    vmov.f32 s27, s22
+; CHECK-NEXT:    vmov.f32 s28, s16
+; CHECK-NEXT:    vmov.f32 s29, s19
 ; CHECK-NEXT:    vmov.f32 s31, s21
 ; CHECK-NEXT:    vadd.f32 q6, q7, q6
+; CHECK-NEXT:    vmov.f32 s12, s18
+; CHECK-NEXT:    vmov.f32 s14, s20
 ; CHECK-NEXT:    vmov.f32 s15, s23
 ; CHECK-NEXT:    vadd.f32 q3, q6, q3
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
@@ -1079,9 +1060,9 @@ define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    vmovx.f16 s8, s0
 ; CHECK-NEXT:    vmovx.f16 s4, s2
 ; CHECK-NEXT:    vins.f16 s8, s2
-; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vmovx.f16 s2, s1
 ; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    vins.f16 s0, s6
+; CHECK-NEXT:    vins.f16 s0, s2
 ; CHECK-NEXT:    vadd.f16 q1, q0, q2
 ; CHECK-NEXT:    vmov.f32 s0, s1
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
@@ -1102,32 +1083,29 @@ entry:
 define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld3_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8}
-; CHECK-NEXT:    vpush {d8}
 ; CHECK-NEXT:    ldrd r2, r3, [r0, #16]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vmovx.f16 s4, s9
-; CHECK-NEXT:    vins.f16 s1, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s8
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmovx.f16 s16, s5
-; CHECK-NEXT:    vins.f16 s12, s6
-; CHECK-NEXT:    vins.f16 s4, s16
-; CHECK-NEXT:    vmovx.f16 s16, s6
-; CHECK-NEXT:    vins.f16 s5, s16
+; CHECK-NEXT:    vmov.32 q2[1], r3
 ; CHECK-NEXT:    vmovx.f16 s13, s7
-; CHECK-NEXT:    vins.f16 s7, s8
+; CHECK-NEXT:    vmovx.f16 s0, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vins.f16 s1, s0
+; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vins.f16 s4, s0
+; CHECK-NEXT:    vmovx.f16 s0, s6
+; CHECK-NEXT:    vins.f16 s5, s0
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vins.f16 s7, s0
 ; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vins.f16 s12, s6
 ; CHECK-NEXT:    vins.f16 s13, s9
 ; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vadd.f16 q1, q1, q3
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vmov r0, r2, d0
 ; CHECK-NEXT:    strd r0, r2, [r1]
-; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <12 x half>, <12 x half>* %src, align 4
@@ -1143,49 +1121,47 @@ entry:
 define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld3_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .vsave {d8, d9, d10}
+; CHECK-NEXT:    vpush {d8, d9, d10}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s17
-; CHECK-NEXT:    vmov.f32 s5, s16
-; CHECK-NEXT:    vmovx.f16 s20, s15
-; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vmov.f32 s11, s14
-; CHECK-NEXT:    vins.f16 s11, s20
-; CHECK-NEXT:    vmov.f32 s6, s19
-; CHECK-NEXT:    vmovx.f16 s20, s12
-; CHECK-NEXT:    vmov.f32 s28, s18
-; CHECK-NEXT:    vins.f16 s6, s20
-; CHECK-NEXT:    vmovx.f16 s20, s19
-; CHECK-NEXT:    vins.f16 s28, s20
-; CHECK-NEXT:    vmovx.f16 s24, s1
-; CHECK-NEXT:    vmovx.f16 s20, s0
-; CHECK-NEXT:    vins.f16 s0, s24
-; CHECK-NEXT:    vins.f16 s20, s2
-; CHECK-NEXT:    vmovx.f16 s26, s16
-; CHECK-NEXT:    vmovx.f16 s21, s3
-; CHECK-NEXT:    vins.f16 s3, s26
-; CHECK-NEXT:    vins.f16 s21, s17
-; CHECK-NEXT:    vmovx.f16 s30, s14
-; CHECK-NEXT:    vmovx.f16 s23, s13
-; CHECK-NEXT:    vmov.f32 s10, s12
+; CHECK-NEXT:    vmovx.f16 s6, s2
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vmovx.f16 s6, s13
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vins.f16 s5, s6
+; CHECK-NEXT:    vmovx.f16 s6, s11
+; CHECK-NEXT:    vins.f16 s7, s6
+; CHECK-NEXT:    vmov.f32 s6, s15
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmovx.f16 s12, s12
+; CHECK-NEXT:    vmovx.f16 s15, s15
+; CHECK-NEXT:    vmov.f32 s20, s14
+; CHECK-NEXT:    vmovx.f16 s10, s10
+; CHECK-NEXT:    vmovx.f16 s17, s3
+; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vmovx.f16 s18, s14
+; CHECK-NEXT:    vins.f16 s6, s16
+; CHECK-NEXT:    vmovx.f16 s16, s0
+; CHECK-NEXT:    vmovx.f16 s1, s1
+; CHECK-NEXT:    vins.f16 s20, s15
+; CHECK-NEXT:    vins.f16 s3, s12
+; CHECK-NEXT:    vins.f16 s9, s10
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s16, s2
 ; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vins.f16 s13, s30
-; CHECK-NEXT:    vins.f16 s23, s15
-; CHECK-NEXT:    vmov.f32 s2, s28
-; CHECK-NEXT:    vmovx.f16 s22, s18
-; CHECK-NEXT:    vmov.f32 s3, s13
-; CHECK-NEXT:    vins.f16 s22, s12
-; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vadd.f16 q0, q0, q5
+; CHECK-NEXT:    vins.f16 s17, s13
+; CHECK-NEXT:    vins.f16 s19, s11
+; CHECK-NEXT:    vins.f16 s18, s8
+; CHECK-NEXT:    vmov.f32 s2, s20
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vadd.f16 q0, q0, q4
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <24 x half>, <24 x half>* %src, align 4
@@ -1201,89 +1177,85 @@ entry:
 define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld3_v16f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .vsave {d8, d9, d10}
+; CHECK-NEXT:    vpush {d8, d9, d10}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
 ; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s13
+; CHECK-NEXT:    vmovx.f16 s6, s2
+; CHECK-NEXT:    vins.f16 s4, s6
 ; CHECK-NEXT:    vmov.f32 s5, s12
-; CHECK-NEXT:    vmovx.f16 s24, s1
-; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmovx.f16 s6, s13
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vins.f16 s5, s6
+; CHECK-NEXT:    vmovx.f16 s6, s11
+; CHECK-NEXT:    vins.f16 s7, s6
 ; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vmovx.f16 s26, s12
-; CHECK-NEXT:    vmovx.f16 s20, s11
-; CHECK-NEXT:    vmov.f32 s19, s10
-; CHECK-NEXT:    vins.f16 s19, s20
-; CHECK-NEXT:    vmovx.f16 s20, s8
-; CHECK-NEXT:    vins.f16 s6, s20
-; CHECK-NEXT:    vmovx.f16 s20, s15
-; CHECK-NEXT:    vmov.f32 s28, s14
-; CHECK-NEXT:    vmovx.f16 s30, s10
-; CHECK-NEXT:    vins.f16 s28, s20
-; CHECK-NEXT:    vmovx.f16 s20, s0
-; CHECK-NEXT:    vins.f16 s0, s24
-; CHECK-NEXT:    vins.f16 s20, s2
-; CHECK-NEXT:    vmovx.f16 s21, s3
-; CHECK-NEXT:    vins.f16 s3, s26
-; CHECK-NEXT:    vins.f16 s21, s13
-; CHECK-NEXT:    vmov.f32 s18, s8
-; CHECK-NEXT:    vmovx.f16 s23, s9
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmovx.f16 s12, s12
+; CHECK-NEXT:    vmovx.f16 s15, s15
+; CHECK-NEXT:    vmov.f32 s20, s14
+; CHECK-NEXT:    vmovx.f16 s10, s10
+; CHECK-NEXT:    vmovx.f16 s17, s3
+; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vmovx.f16 s18, s14
+; CHECK-NEXT:    vins.f16 s6, s16
+; CHECK-NEXT:    vmovx.f16 s16, s0
+; CHECK-NEXT:    vmovx.f16 s1, s1
+; CHECK-NEXT:    vins.f16 s20, s15
+; CHECK-NEXT:    vins.f16 s3, s12
+; CHECK-NEXT:    vins.f16 s9, s10
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s16, s2
+; CHECK-NEXT:    vins.f16 s17, s13
 ; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vins.f16 s9, s30
-; CHECK-NEXT:    vins.f16 s23, s11
-; CHECK-NEXT:    vmovx.f16 s22, s14
-; CHECK-NEXT:    vmov.f32 s2, s28
-; CHECK-NEXT:    vins.f16 s22, s8
+; CHECK-NEXT:    vins.f16 s19, s11
+; CHECK-NEXT:    vins.f16 s18, s8
 ; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov.f32 s7, s19
-; CHECK-NEXT:    vadd.f16 q0, q0, q5
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s2, s20
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-NEXT:    vmov.f32 s20, s14
 ; CHECK-NEXT:    vadd.f16 q1, q0, q1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s16, s2
-; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vmovx.f16 s20, s11
-; CHECK-NEXT:    vins.f16 s4, s16
-; CHECK-NEXT:    vmovx.f16 s16, s13
 ; CHECK-NEXT:    vmov.f32 s5, s12
-; CHECK-NEXT:    vmovx.f16 s24, s1
-; CHECK-NEXT:    vins.f16 s5, s16
-; CHECK-NEXT:    vmov.f32 s19, s10
-; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmovx.f16 s6, s2
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s6, s13
+; CHECK-NEXT:    vins.f16 s5, s6
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vmovx.f16 s6, s11
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vins.f16 s7, s6
 ; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vmovx.f16 s20, s8
-; CHECK-NEXT:    vmov.f32 s28, s14
-; CHECK-NEXT:    vins.f16 s6, s20
-; CHECK-NEXT:    vmovx.f16 s20, s15
-; CHECK-NEXT:    vins.f16 s28, s20
-; CHECK-NEXT:    vmovx.f16 s20, s0
-; CHECK-NEXT:    vins.f16 s0, s24
-; CHECK-NEXT:    vins.f16 s20, s2
-; CHECK-NEXT:    vmovx.f16 s21, s3
-; CHECK-NEXT:    vmovx.f16 s26, s12
-; CHECK-NEXT:    vins.f16 s21, s13
-; CHECK-NEXT:    vins.f16 s3, s26
-; CHECK-NEXT:    vmovx.f16 s30, s10
-; CHECK-NEXT:    vmovx.f16 s23, s9
-; CHECK-NEXT:    vmov.f32 s18, s8
-; CHECK-NEXT:    vins.f16 s9, s30
-; CHECK-NEXT:    vins.f16 s23, s11
+; CHECK-NEXT:    vmovx.f16 s15, s15
+; CHECK-NEXT:    vmovx.f16 s12, s12
+; CHECK-NEXT:    vmovx.f16 s10, s10
+; CHECK-NEXT:    vmovx.f16 s17, s3
+; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vmovx.f16 s18, s14
+; CHECK-NEXT:    vins.f16 s6, s16
+; CHECK-NEXT:    vmovx.f16 s16, s0
+; CHECK-NEXT:    vmovx.f16 s1, s1
+; CHECK-NEXT:    vins.f16 s20, s15
+; CHECK-NEXT:    vins.f16 s3, s12
+; CHECK-NEXT:    vins.f16 s9, s10
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s16, s2
 ; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vmovx.f16 s22, s14
-; CHECK-NEXT:    vmov.f32 s2, s28
-; CHECK-NEXT:    vins.f16 s22, s8
+; CHECK-NEXT:    vins.f16 s17, s13
+; CHECK-NEXT:    vins.f16 s19, s11
+; CHECK-NEXT:    vins.f16 s18, s8
+; CHECK-NEXT:    vmov.f32 s2, s20
 ; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov.f32 s7, s19
-; CHECK-NEXT:    vadd.f16 q0, q0, q5
+; CHECK-NEXT:    vadd.f16 q0, q0, q4
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <48 x half>, <48 x half>* %src, align 4
@@ -1303,11 +1275,11 @@ define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vadd.f64 d4, d3, d0
-; CHECK-NEXT:    vadd.f64 d5, d6, d7
-; CHECK-NEXT:    vadd.f64 d1, d4, d1
-; CHECK-NEXT:    vadd.f64 d0, d5, d2
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f64 d0, d3, d0
+; CHECK-NEXT:    vadd.f64 d3, d4, d5
+; CHECK-NEXT:    vadd.f64 d1, d0, d1
+; CHECK-NEXT:    vadd.f64 d0, d3, d2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1324,25 +1296,25 @@ entry:
 define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) {
 ; CHECK-LABEL: vld3_v4f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT:    vadd.f64 d5, d6, d7
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q6, [r0]
-; CHECK-NEXT:    vadd.f64 d4, d1, d2
-; CHECK-NEXT:    vadd.f64 d10, d9, d6
-; CHECK-NEXT:    vadd.f64 d11, d12, d13
-; CHECK-NEXT:    vadd.f64 d3, d4, d3
-; CHECK-NEXT:    vadd.f64 d2, d5, d0
-; CHECK-NEXT:    vadd.f64 d1, d10, d7
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vadd.f64 d0, d11, d8
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vadd.f64 d1, d1, d2
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vadd.f64 d2, d4, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vadd.f64 d4, d7, d4
+; CHECK-NEXT:    vadd.f64 d7, d8, d9
+; CHECK-NEXT:    vadd.f64 d1, d1, d3
+; CHECK-NEXT:    vadd.f64 d0, d2, d0
+; CHECK-NEXT:    vadd.f64 d3, d4, d5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vadd.f64 d2, d7, d6
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <12 x double>, <12 x double>* %src, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
index 06c10e0b7bb1a..8ddfb5fb44878 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
@@ -6,18 +6,14 @@
 define <16 x i32> *@vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vld4_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i32 q4, q2, q3
+; CHECK-NEXT:    vadd.i32 q2, q2, q3
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x i32>, <16 x i32>* %src, align 4
@@ -38,18 +34,14 @@ entry:
 define <32 x i16> *@vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i16 q4, q2, q3
+; CHECK-NEXT:    vadd.i16 q2, q2, q3
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q0, q4
+; CHECK-NEXT:    vadd.i16 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x i16>, <32 x i16>* %src, align 4
@@ -70,18 +62,14 @@ entry:
 define <64 x i8> *@vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-LABEL: vld4_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i8 q4, q2, q3
+; CHECK-NEXT:    vadd.i8 q2, q2, q3
 ; CHECK-NEXT:    vadd.i8 q0, q0, q1
-; CHECK-NEXT:    vadd.i8 q0, q0, q4
+; CHECK-NEXT:    vadd.i8 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <64 x i8>, <64 x i8>* %src, align 4
@@ -109,23 +97,19 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s6, s10
 ; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov lr, r12, d5
 ; CHECK-NEXT:    vldrw.u32 q2, [r0], #64
-; CHECK-NEXT:    vmov.f64 d6, d5
+; CHECK-NEXT:    vmov r4, r8, d9
+; CHECK-NEXT:    vmov.f32 s12, s10
 ; CHECK-NEXT:    vmov.f32 s13, s11
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmov.f32 s10, s16
-; CHECK-NEXT:    vmov.f32 s15, s19
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vmov lr, r12, d3
 ; CHECK-NEXT:    vmov r2, r7, d1
-; CHECK-NEXT:    vmov r4, r8, d7
-; CHECK-NEXT:    vmov r3, r6, d5
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s3, s17
+; CHECK-NEXT:    vmov r3, r6, d1
 ; CHECK-NEXT:    adds.w r2, r2, lr
 ; CHECK-NEXT:    adc.w r7, r7, r12
 ; CHECK-NEXT:    adds r3, r3, r4
@@ -166,18 +150,14 @@ entry:
 define <16 x float> *@vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) {
 ; CHECK-LABEL: vld4_v4f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f32 q4, q2, q3
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vadd.f32 q0, q0, q4
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x float>, <16 x float>* %src, align 4
@@ -198,18 +178,14 @@ entry:
 define <32 x half> *@vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld4_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f16 q4, q2, q3
+; CHECK-NEXT:    vadd.f16 q2, q2, q3
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x half>, <32 x half>* %src, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index 74b6b8d7e2843..5058013576343 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -6,17 +6,17 @@
 define void @vld4_v2i32(<8 x i32> *%src, <2 x i32> *%dst) {
 ; CHECK-LABEL: vld4_v2i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s8, s3
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.f32 s10, s7
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.f32 s12, s1
-; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s6, s5
 ; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov.f32 s8, s3
+; CHECK-NEXT:    vmov.f32 s12, s1
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    add r0, r2
-; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    add.w r12, r2, r0
@@ -44,18 +44,14 @@ entry:
 define void @vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vld4_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i32 q4, q2, q3
+; CHECK-NEXT:    vadd.i32 q2, q2, q3
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x i32>, <16 x i32>* %src, align 4
@@ -79,7 +75,6 @@ define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT:    vadd.i32 q4, q2, q3
 ; CHECK-NEXT:    vadd.i32 q5, q0, q1
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
@@ -88,10 +83,9 @@ define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i32 q5, q2, q3
+; CHECK-NEXT:    vadd.i32 q2, q2, q3
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, q5
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
@@ -111,12 +105,10 @@ entry:
 define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK-LABEL: vld4_v16i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5}
-; CHECK-NEXT:    push {r4, r5}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #112
-; CHECK-NEXT:    sub sp, #112
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    add.w r3, r0, #192
@@ -124,52 +116,40 @@ define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i32 q4, q2, q3
-; CHECK-NEXT:    vadd.i32 q6, q0, q1
-; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vadd.i32 q2, q6, q2
-; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vadd.i32 q2, q3, q4
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vstrw.32 q4, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q1, q3, q1
-; CHECK-NEXT:    vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
+; CHECK-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-NEXT:    vld40.32 {q3, q4, q5, q6}, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
+; CHECK-NEXT:    vld41.32 {q3, q4, q5, q6}, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vld42.32 {q3, q4, q5, q6}, [r3]
+; CHECK-NEXT:    vld43.32 {q3, q4, q5, q6}, [r3]
 ; CHECK-NEXT:    vadd.i32 q1, q5, q6
 ; CHECK-NEXT:    vadd.i32 q2, q3, q4
+; CHECK-NEXT:    vadd.i32 q0, q2, q1
+; CHECK-NEXT:    vld40.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    vadd.i32 q1, q6, q7
+; CHECK-NEXT:    vadd.i32 q2, q4, q5
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #112
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    pop {r4, r5}
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <64 x i32>, <64 x i32>* %src, align 4
@@ -189,25 +169,25 @@ define void @vld4_v4i32_align1(<16 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrb.u8 q2, [r0]
-; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s16, s11
-; CHECK-NEXT:    vmov.f64 d10, d5
+; CHECK-NEXT:    vmov.f32 s20, s10
 ; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s19, s3
 ; CHECK-NEXT:    vmov.f32 s21, s14
-; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s19, s3
 ; CHECK-NEXT:    vmov.f32 s23, s2
 ; CHECK-NEXT:    vadd.i32 q4, q5, q4
 ; CHECK-NEXT:    vmov.f32 s20, s9
 ; CHECK-NEXT:    vmov.f32 s21, s13
-; CHECK-NEXT:    vmov.f32 s9, s12
 ; CHECK-NEXT:    vmov.f32 s22, s5
-; CHECK-NEXT:    vmov.f32 s10, s4
 ; CHECK-NEXT:    vmov.f32 s23, s1
+; CHECK-NEXT:    vmov.f32 s9, s12
+; CHECK-NEXT:    vmov.f32 s10, s4
 ; CHECK-NEXT:    vmov.f32 s11, s0
 ; CHECK-NEXT:    vadd.i32 q0, q2, q5
 ; CHECK-NEXT:    vadd.i32 q0, q0, q4
@@ -316,18 +296,14 @@ entry:
 define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i16 q4, q2, q3
+; CHECK-NEXT:    vadd.i16 q2, q2, q3
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q0, q4
+; CHECK-NEXT:    vadd.i16 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x i16>, <32 x i16>* %src, align 2
@@ -351,7 +327,6 @@ define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT:    vadd.i16 q4, q2, q3
 ; CHECK-NEXT:    vadd.i16 q5, q0, q1
 ; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
@@ -360,10 +335,9 @@ define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i16 q5, q2, q3
+; CHECK-NEXT:    vadd.i16 q2, q2, q3
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q0, q5
+; CHECK-NEXT:    vadd.i16 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
@@ -383,58 +357,56 @@ entry:
 define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
 ; CHECK-NEXT:    vldrb.u8 q2, [r0, #48]
-; CHECK-NEXT:    vmovx.f16 s0, s7
 ; CHECK-NEXT:    vmovx.f16 s18, s5
+; CHECK-NEXT:    vmovx.f16 s0, s7
 ; CHECK-NEXT:    vins.f16 s18, s0
-; CHECK-NEXT:    vmovx.f16 s0, s11
 ; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vmovx.f16 s0, s11
 ; CHECK-NEXT:    vins.f16 s5, s7
 ; CHECK-NEXT:    vins.f16 s19, s0
 ; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    vins.f16 s9, s11
-; CHECK-NEXT:    vmovx.f16 s24, s6
-; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmov.f32 s22, s5
 ; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vmovx.f16 s12, s3
 ; CHECK-NEXT:    vins.f16 s16, s12
 ; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
 ; CHECK-NEXT:    vins.f16 s1, s3
-; CHECK-NEXT:    vmovx.f16 s20, s15
-; CHECK-NEXT:    vmovx.f16 s17, s13
-; CHECK-NEXT:    vins.f16 s17, s20
-; CHECK-NEXT:    vmov.f32 s22, s5
 ; CHECK-NEXT:    vmov.f32 s23, s9
+; CHECK-NEXT:    vmovx.f16 s17, s13
+; CHECK-NEXT:    vmovx.f16 s20, s15
 ; CHECK-NEXT:    vins.f16 s13, s15
+; CHECK-NEXT:    vins.f16 s17, s20
 ; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmovx.f16 s1, s6
 ; CHECK-NEXT:    vmov.f32 s21, s13
 ; CHECK-NEXT:    vadd.i16 q4, q5, q4
 ; CHECK-NEXT:    vmovx.f16 s22, s4
-; CHECK-NEXT:    vins.f16 s22, s24
-; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vins.f16 s22, s1
 ; CHECK-NEXT:    vmovx.f16 s23, s8
-; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vmov.f32 s6, s4
-; CHECK-NEXT:    vmov.f32 s7, s8
-; CHECK-NEXT:    vins.f16 s23, s24
-; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vmovx.f16 s1, s10
 ; CHECK-NEXT:    vmovx.f16 s20, s0
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vins.f16 s23, s1
+; CHECK-NEXT:    vmovx.f16 s1, s2
+; CHECK-NEXT:    vins.f16 s20, s1
 ; CHECK-NEXT:    vmovx.f16 s21, s12
-; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s1, s14
+; CHECK-NEXT:    vins.f16 s8, s10
+; CHECK-NEXT:    vins.f16 s4, s6
 ; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vins.f16 s21, s1
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmov.f32 s3, s8
 ; CHECK-NEXT:    vmov.f32 s1, s12
-; CHECK-NEXT:    vmov.f32 s2, s6
-; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vadd.i16 q0, q0, q5
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x i16>, <32 x i16>* %src, align 1
@@ -608,18 +580,14 @@ entry:
 define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-LABEL: vld4_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i8 q4, q2, q3
+; CHECK-NEXT:    vadd.i8 q2, q2, q3
 ; CHECK-NEXT:    vadd.i8 q0, q0, q1
-; CHECK-NEXT:    vadd.i8 q0, q0, q4
+; CHECK-NEXT:    vadd.i8 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <64 x i8>, <64 x i8>* %src, align 1
@@ -646,23 +614,19 @@ define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d2, d1
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.f32 s6, s10
 ; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov.f32 s7, s11
 ; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov lr, r12, d5
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov.f64 d6, d5
+; CHECK-NEXT:    vmov r0, r8, d9
+; CHECK-NEXT:    vmov.f32 s12, s10
 ; CHECK-NEXT:    vmov.f32 s13, s11
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmov.f32 s10, s16
-; CHECK-NEXT:    vmov.f32 s15, s19
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vmov lr, r12, d3
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    vmov r0, r8, d7
-; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s3, s17
+; CHECK-NEXT:    vmov r5, r6, d1
 ; CHECK-NEXT:    adds.w r2, r2, lr
 ; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    vmov r4, r12, d2
@@ -706,54 +670,45 @@ define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d6, d3
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #80]
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s8, s2
+; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s2, s20
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s3, s21
+; CHECK-NEXT:    vmov r3, r2, d11
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s0, s26
+; CHECK-NEXT:    vmov.f32 s1, s27
+; CHECK-NEXT:    vmov lr, r12, d9
+; CHECK-NEXT:    vmov.f32 s12, s6
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s14, s2
-; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s15, s3
-; CHECK-NEXT:    vmov.f32 s7, s1
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
-; CHECK-NEXT:    vmov.f64 d4, d11
-; CHECK-NEXT:    vmov.f32 s9, s23
-; CHECK-NEXT:    vmov r3, r2, d7
-; CHECK-NEXT:    vmov r4, r5, d3
-; CHECK-NEXT:    vmov.f32 s10, s18
-; CHECK-NEXT:    vmov.f32 s11, s19
-; CHECK-NEXT:    vmov.f32 s22, s16
-; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s3, s17
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT:    vmov q7, q5
-; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT:    vmov r0, r6, d15
-; CHECK-NEXT:    vmov.f64 d14, d11
-; CHECK-NEXT:    vmov.f32 s29, s23
-; CHECK-NEXT:    vmov lr, r12, d5
-; CHECK-NEXT:    vmov.f32 s30, s26
-; CHECK-NEXT:    vmov.f32 s22, s24
-; CHECK-NEXT:    vmov.f32 s31, s27
-; CHECK-NEXT:    vmov.f32 s23, s25
-; CHECK-NEXT:    vmov.f64 d12, d9
+; CHECK-NEXT:    vmov.f32 s6, s28
+; CHECK-NEXT:    vmov.f32 s7, s29
+; CHECK-NEXT:    vmov.f32 s10, s20
+; CHECK-NEXT:    vmov.f32 s11, s21
+; CHECK-NEXT:    vmov r0, r6, d1
 ; CHECK-NEXT:    adds r7, r4, r3
+; CHECK-NEXT:    vmov r4, r8, d0
 ; CHECK-NEXT:    adcs r5, r2
-; CHECK-NEXT:    vmov r4, r8, d14
-; CHECK-NEXT:    vmov r2, r3, d10
-; CHECK-NEXT:    vmov.f32 s25, s19
-; CHECK-NEXT:    vmov.f32 s26, s2
-; CHECK-NEXT:    vmov.f32 s18, s0
-; CHECK-NEXT:    vmov.f32 s27, s3
-; CHECK-NEXT:    vmov.f32 s19, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov r2, r3, d12
+; CHECK-NEXT:    vmov.f32 s0, s18
+; CHECK-NEXT:    vmov.f32 s1, s19
 ; CHECK-NEXT:    adds.w r0, r0, lr
 ; CHECK-NEXT:    adc.w r6, r6, r12
 ; CHECK-NEXT:    adds.w lr, r0, r7
 ; CHECK-NEXT:    adc.w r12, r6, r5
-; CHECK-NEXT:    vmov r6, r5, d12
+; CHECK-NEXT:    vmov r6, r5, d0
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    adds r2, r2, r4
 ; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    adc.w r3, r3, r8
@@ -762,11 +717,11 @@ define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-NEXT:    adds.w r9, r6, r2
 ; CHECK-NEXT:    adc.w r8, r0, r3
 ; CHECK-NEXT:    vmov r5, r4, d15
-; CHECK-NEXT:    vmov r3, r6, d11
-; CHECK-NEXT:    vmov r7, r0, d9
+; CHECK-NEXT:    vmov r3, r6, d3
+; CHECK-NEXT:    vmov r7, r0, d5
 ; CHECK-NEXT:    adds r3, r3, r5
 ; CHECK-NEXT:    adcs r6, r4
-; CHECK-NEXT:    vmov r5, r4, d13
+; CHECK-NEXT:    vmov r5, r4, d11
 ; CHECK-NEXT:    adds r5, r5, r7
 ; CHECK-NEXT:    adcs r0, r4
 ; CHECK-NEXT:    adds r3, r3, r5
@@ -808,11 +763,11 @@ entry:
 define void @vld4_v2f32(<8 x float> *%src, <2 x float> *%dst) {
 ; CHECK-LABEL: vld4_v2f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmov.f32 s8, s7
-; CHECK-NEXT:    vmov.f64 d6, d3
 ; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vmov.f32 s12, s6
 ; CHECK-NEXT:    vmov.f32 s13, s2
 ; CHECK-NEXT:    vadd.f32 q2, q3, q2
 ; CHECK-NEXT:    vmov.f32 s12, s5
@@ -838,18 +793,14 @@ entry:
 define void @vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) {
 ; CHECK-LABEL: vld4_v4f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f32 q4, q2, q3
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vadd.f32 q0, q0, q4
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x float>, <16 x float>* %src, align 4
@@ -873,7 +824,6 @@ define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) {
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT:    vadd.f32 q4, q2, q3
 ; CHECK-NEXT:    vadd.f32 q5, q0, q1
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
@@ -882,10 +832,9 @@ define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) {
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f32 q5, q2, q3
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vadd.f32 q0, q0, q5
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
@@ -905,12 +854,10 @@ entry:
 define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) {
 ; CHECK-LABEL: vld4_v16f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5}
-; CHECK-NEXT:    push {r4, r5}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #112
-; CHECK-NEXT:    sub sp, #112
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    add.w r3, r0, #192
@@ -918,52 +865,40 @@ define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) {
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    adds r0, #128
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f32 q4, q2, q3
-; CHECK-NEXT:    vadd.f32 q6, q0, q1
-; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vadd.f32 q0, q1, q0
-; CHECK-NEXT:    vadd.f32 q2, q6, q2
-; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vadd.f32 q2, q3, q4
-; CHECK-NEXT:    vadd.f32 q0, q0, q2
-; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vstrw.32 q4, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.f32 q0, q0, q2
-; CHECK-NEXT:    vadd.f32 q1, q3, q1
-; CHECK-NEXT:    vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vld40.32 {q3, q4, q5, q6}, [r3]
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
+; CHECK-NEXT:    vld41.32 {q3, q4, q5, q6}, [r3]
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vld42.32 {q3, q4, q5, q6}, [r3]
+; CHECK-NEXT:    vld43.32 {q3, q4, q5, q6}, [r3]
 ; CHECK-NEXT:    vadd.f32 q1, q5, q6
 ; CHECK-NEXT:    vadd.f32 q2, q3, q4
+; CHECK-NEXT:    vadd.f32 q0, q2, q1
+; CHECK-NEXT:    vld40.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.32 {q4, q5, q6, q7}, [r2]
+; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
+; CHECK-NEXT:    vadd.f32 q1, q6, q7
+; CHECK-NEXT:    vadd.f32 q2, q4, q5
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vadd.f32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #112
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    pop {r4, r5}
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <64 x float>, <64 x float>* %src, align 4
@@ -983,25 +918,25 @@ define void @vld4_v4f32_align1(<16 x float> *%src, <4 x float> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrb.u8 q2, [r0]
-; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s16, s11
-; CHECK-NEXT:    vmov.f64 d10, d5
+; CHECK-NEXT:    vmov.f32 s20, s10
 ; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s19, s3
 ; CHECK-NEXT:    vmov.f32 s21, s14
-; CHECK-NEXT:    vmov.f32 s18, s7
 ; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s19, s3
 ; CHECK-NEXT:    vmov.f32 s23, s2
 ; CHECK-NEXT:    vadd.f32 q4, q5, q4
 ; CHECK-NEXT:    vmov.f32 s20, s9
 ; CHECK-NEXT:    vmov.f32 s21, s13
-; CHECK-NEXT:    vmov.f32 s9, s12
 ; CHECK-NEXT:    vmov.f32 s22, s5
-; CHECK-NEXT:    vmov.f32 s10, s4
 ; CHECK-NEXT:    vmov.f32 s23, s1
+; CHECK-NEXT:    vmov.f32 s9, s12
+; CHECK-NEXT:    vmov.f32 s10, s4
 ; CHECK-NEXT:    vmov.f32 s11, s0
 ; CHECK-NEXT:    vadd.f32 q0, q2, q5
 ; CHECK-NEXT:    vadd.f32 q0, q0, q4
@@ -1027,17 +962,17 @@ define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) {
 ; CHECK-LABEL: vld4_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s4, s3
 ; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vmovx.f16 s4, s3
 ; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vmovx.f16 s4, s2
 ; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s1, s3
 ; CHECK-NEXT:    vins.f16 s12, s4
-; CHECK-NEXT:    vins.f16 s0, s2
 ; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-NEXT:    vins.f16 s0, s2
 ; CHECK-NEXT:    vadd.f16 q1, q1, q2
+; CHECK-NEXT:    vadd.f16 q0, q0, q3
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    str r0, [r1]
@@ -1058,27 +993,27 @@ entry:
 define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld4_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s4, s8
 ; CHECK-NEXT:    vldrh.u16 q2, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmovx.f16 s6, s2
 ; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vmovx.f16 s16, s3
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vmovx.f16 s5, s8
-; CHECK-NEXT:    vins.f16 s5, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s1
-; CHECK-NEXT:    vins.f16 s12, s16
-; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vmovx.f16 s16, s11
+; CHECK-NEXT:    vmovx.f16 s2, s3
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s5, s8
+; CHECK-NEXT:    vmovx.f16 s6, s10
+; CHECK-NEXT:    vins.f16 s12, s2
 ; CHECK-NEXT:    vmovx.f16 s13, s9
+; CHECK-NEXT:    vmovx.f16 s2, s11
 ; CHECK-NEXT:    vins.f16 s1, s3
-; CHECK-NEXT:    vins.f16 s13, s16
 ; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vins.f16 s8, s10
 ; CHECK-NEXT:    vmov.f32 s16, s1
+; CHECK-NEXT:    vins.f16 s5, s6
+; CHECK-NEXT:    vins.f16 s13, s2
 ; CHECK-NEXT:    vmov.f32 s1, s8
 ; CHECK-NEXT:    vmov.f32 s17, s9
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
@@ -1086,7 +1021,7 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vadd.f16 q0, q0, q3
 ; CHECK-NEXT:    vmov r0, r2, d0
 ; CHECK-NEXT:    strd r0, r2, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x half>, <16 x half>* %src, align 2
@@ -1104,18 +1039,14 @@ entry:
 define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld4_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f16 q4, q2, q3
+; CHECK-NEXT:    vadd.f16 q2, q2, q3
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x half>, <32 x half>* %src, align 2
@@ -1133,37 +1064,25 @@ entry:
 define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld4_v16f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5}
-; CHECK-NEXT:    push {r4, r5}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #80
-; CHECK-NEXT:    sub sp, #80
 ; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT:    vld40.16 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT:    vadd.f16 q2, q2, q3
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vld41.16 {q4, q5, q6, q7}, [r0]
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
 ; CHECK-NEXT:    vld42.16 {q4, q5, q6, q7}, [r0]
 ; CHECK-NEXT:    vld43.16 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT:    @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7
-; CHECK-NEXT:    vadd.f16 q0, q6, q7
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vadd.f16 q6, q6, q7
 ; CHECK-NEXT:    vadd.f16 q4, q4, q5
-; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.f16 q4, q4, q0
-; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.f16 q4, q4, q6
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
-; CHECK-NEXT:    vadd.f16 q4, q2, q3
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-NEXT:    vadd.f16 q0, q0, q4
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #80
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <64 x half>, <64 x half>* %src, align 2
@@ -1185,48 +1104,48 @@ define void @vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
 ; CHECK-NEXT:    vldrb.u8 q2, [r0, #48]
-; CHECK-NEXT:    vmovx.f16 s4, s3
 ; CHECK-NEXT:    vmovx.f16 s18, s1
+; CHECK-NEXT:    vmovx.f16 s4, s3
 ; CHECK-NEXT:    vins.f16 s18, s4
-; CHECK-NEXT:    vmovx.f16 s4, s11
 ; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vmovx.f16 s4, s11
 ; CHECK-NEXT:    vins.f16 s1, s3
 ; CHECK-NEXT:    vins.f16 s19, s4
 ; CHECK-NEXT:    vldrb.u8 q1, [r0]
-; CHECK-NEXT:    vmovx.f16 s24, s2
-; CHECK-NEXT:    vins.f16 s9, s11
-; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s22, s0
+; CHECK-NEXT:    vmovx.f16 s3, s2
 ; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vmovx.f16 s12, s7
 ; CHECK-NEXT:    vins.f16 s16, s12
 ; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
-; CHECK-NEXT:    vins.f16 s5, s7
-; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vins.f16 s22, s3
+; CHECK-NEXT:    vmovx.f16 s23, s8
 ; CHECK-NEXT:    vmovx.f16 s17, s13
+; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmovx.f16 s3, s10
 ; CHECK-NEXT:    vins.f16 s17, s20
-; CHECK-NEXT:    vmovx.f16 s22, s0
-; CHECK-NEXT:    vins.f16 s22, s24
-; CHECK-NEXT:    vmovx.f16 s24, s10
-; CHECK-NEXT:    vmovx.f16 s23, s8
-; CHECK-NEXT:    vins.f16 s13, s15
-; CHECK-NEXT:    vins.f16 s23, s24
-; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vins.f16 s23, s3
 ; CHECK-NEXT:    vmovx.f16 s20, s4
-; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vmovx.f16 s3, s6
+; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vins.f16 s5, s7
+; CHECK-NEXT:    vins.f16 s13, s15
+; CHECK-NEXT:    vins.f16 s20, s3
 ; CHECK-NEXT:    vmovx.f16 s21, s12
+; CHECK-NEXT:    vmovx.f16 s3, s14
 ; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vins.f16 s21, s24
-; CHECK-NEXT:    vmov.f32 s26, s1
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s12, s14
 ; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmov.f32 s27, s9
 ; CHECK-NEXT:    vmov.f32 s24, s5
-; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vins.f16 s21, s3
+; CHECK-NEXT:    vmov.f32 s26, s1
+; CHECK-NEXT:    vmov.f32 s27, s9
+; CHECK-NEXT:    vmov.f32 s25, s13
 ; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vadd.f16 q4, q6, q4
 ; CHECK-NEXT:    vmov.f32 s7, s8
-; CHECK-NEXT:    vmov.f32 s25, s13
 ; CHECK-NEXT:    vmov.f32 s5, s12
-; CHECK-NEXT:    vadd.f16 q4, q6, q4
 ; CHECK-NEXT:    vadd.f16 q0, q1, q5
 ; CHECK-NEXT:    vadd.f16 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
index d26757fc99e89..930212ddc59c0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
@@ -30,9 +30,9 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* noc
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
 ; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
-; CHECK-NEXT:    vmul.f16 q2, q0, q0
-; CHECK-NEXT:    vfma.f16 q2, q1, q1
-; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vmul.f16 q0, q0, q0
+; CHECK-NEXT:    vfma.f16 q0, q1, q1
+; CHECK-NEXT:    vstrb.8 q0, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB0_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r4, r2
@@ -159,9 +159,9 @@ define void @arm_cmplx_mag_squared_f32(float* nocapture readonly %pSrc, float* n
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
 ; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
-; CHECK-NEXT:    vmul.f32 q2, q0, q0
-; CHECK-NEXT:    vfma.f32 q2, q1, q1
-; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vmul.f32 q0, q0, q0
+; CHECK-NEXT:    vfma.f32 q0, q1, q1
+; CHECK-NEXT:    vstrb.8 q0, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB1_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r4, r2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index c073ffbe4a42e..e69d06d475300 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -6,125 +6,119 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    mul r12, r3, r2
 ; CHECK-NEXT:    lsrs.w r2, r12, #2
 ; CHECK-NEXT:    beq.w .LBB0_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mvn r3, #7
-; CHECK-NEXT:    ldr r2, [sp, #88]
+; CHECK-NEXT:    ldr r2, [sp, #56]
 ; CHECK-NEXT:    and.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q4, [r0, #32]
-; CHECK-NEXT:    vldrh.u16 q5, [r0, #48]
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q4, [r0, #48]
 ; CHECK-NEXT:    vldrh.u16 q3, [r0], #64
-; CHECK-NEXT:    vmov.f32 s2, s17
+; CHECK-NEXT:    vmovx.f16 s26, s4
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s6, s6
+; CHECK-NEXT:    vldrh.u16 q5, [r0, #-48]
+; CHECK-NEXT:    vmovx.f16 s27, s16
+; CHECK-NEXT:    vins.f16 s26, s6
+; CHECK-NEXT:    vmovx.f16 s6, s18
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vmovx.f16 s10, s5
+; CHECK-NEXT:    vmovx.f16 s24, s12
+; CHECK-NEXT:    vins.f16 s10, s8
+; CHECK-NEXT:    vins.f16 s27, s6
+; CHECK-NEXT:    vmovx.f16 s6, s14
 ; CHECK-NEXT:    vmovx.f16 s8, s19
-; CHECK-NEXT:    vldrh.u16 q6, [r0, #-48]
-; CHECK-NEXT:    vins.f16 s2, s19
-; CHECK-NEXT:    vmov.f32 s3, s21
-; CHECK-NEXT:    vmovx.f16 s5, s25
-; CHECK-NEXT:    vins.f16 s3, s23
-; CHECK-NEXT:    vmovx.f16 s6, s17
+; CHECK-NEXT:    vmovx.f16 s11, s17
 ; CHECK-NEXT:    vmov.f32 s0, s13
-; CHECK-NEXT:    vins.f16 s6, s8
-; CHECK-NEXT:    vmovx.f16 s8, s23
-; CHECK-NEXT:    vmovx.f16 s7, s21
-; CHECK-NEXT:    vins.f16 s0, s15
-; CHECK-NEXT:    vins.f16 s7, s8
-; CHECK-NEXT:    vmovx.f16 s8, s15
-; CHECK-NEXT:    vmovx.f16 s4, s13
-; CHECK-NEXT:    vins.f16 s25, s27
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s27
-; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vmov.f32 s1, s25
-; CHECK-NEXT:    vmul.f16 q2, q1, r2
-; CHECK-NEXT:    vmul.f16 q0, q0, r2
-; CHECK-NEXT:    vmovx.f16 s7, s0
-; CHECK-NEXT:    vmovx.f16 s28, s8
-; CHECK-NEXT:    vins.f16 s7, s28
-; CHECK-NEXT:    vmovx.f16 s30, s16
-; CHECK-NEXT:    vmovx.f16 s31, s20
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s28, s12
-; CHECK-NEXT:    vins.f16 s16, s18
-; CHECK-NEXT:    vmovx.f16 s29, s24
-; CHECK-NEXT:    vmovx.f16 s4, s18
+; CHECK-NEXT:    vins.f16 s11, s8
+; CHECK-NEXT:    vmovx.f16 s25, s20
+; CHECK-NEXT:    vins.f16 s24, s6
+; CHECK-NEXT:    vmovx.f16 s6, s22
+; CHECK-NEXT:    vmovx.f16 s1, s15
+; CHECK-NEXT:    vmovx.f16 s8, s13
 ; CHECK-NEXT:    vins.f16 s20, s22
-; CHECK-NEXT:    vins.f16 s30, s4
-; CHECK-NEXT:    vmovx.f16 s4, s22
+; CHECK-NEXT:    vins.f16 s16, s18
+; CHECK-NEXT:    vins.f16 s25, s6
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s17
+; CHECK-NEXT:    vins.f16 s0, s15
+; CHECK-NEXT:    vmovx.f16 s9, s21
+; CHECK-NEXT:    vins.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s1, s23
 ; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vins.f16 s31, s4
-; CHECK-NEXT:    vmovx.f16 s4, s14
-; CHECK-NEXT:    vmov.f32 s14, s16
-; CHECK-NEXT:    vins.f16 s24, s26
-; CHECK-NEXT:    vmov.f32 s15, s20
-; CHECK-NEXT:    vins.f16 s28, s4
-; CHECK-NEXT:    vmovx.f16 s4, s26
-; CHECK-NEXT:    vmov.f32 s13, s24
-; CHECK-NEXT:    vins.f16 s29, s4
+; CHECK-NEXT:    vins.f16 s21, s23
+; CHECK-NEXT:    vmov.f32 s14, s4
+; CHECK-NEXT:    vmov.f32 s15, s16
+; CHECK-NEXT:    vins.f16 s9, s1
+; CHECK-NEXT:    vmov.f32 s13, s20
+; CHECK-NEXT:    vmul.f16 q6, q6, r2
 ; CHECK-NEXT:    vmul.f16 q3, q3, r2
-; CHECK-NEXT:    vmul.f16 q7, q7, r2
+; CHECK-NEXT:    vins.f16 s2, s7
+; CHECK-NEXT:    vins.f16 s3, s19
+; CHECK-NEXT:    vmov.f32 s1, s21
+; CHECK-NEXT:    vmul.f16 q0, q0, r2
 ; CHECK-NEXT:    vmovx.f16 s4, s12
-; CHECK-NEXT:    vmovx.f16 s6, s28
+; CHECK-NEXT:    vmovx.f16 s6, s24
+; CHECK-NEXT:    vmul.f16 q2, q2, r2
+; CHECK-NEXT:    vmovx.f16 s7, s0
 ; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s8
 ; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmovx.f16 s5, s9
-; CHECK-NEXT:    vins.f16 s12, s28
-; CHECK-NEXT:    vins.f16 s6, s5
-; CHECK-NEXT:    vmovx.f16 s18, s13
-; CHECK-NEXT:    vmovx.f16 s5, s29
-; CHECK-NEXT:    vins.f16 s1, s9
-; CHECK-NEXT:    vins.f16 s18, s5
-; CHECK-NEXT:    vmovx.f16 s23, s2
-; CHECK-NEXT:    vmovx.f16 s5, s10
-; CHECK-NEXT:    vins.f16 s2, s10
-; CHECK-NEXT:    vins.f16 s23, s5
-; CHECK-NEXT:    vins.f16 s13, s29
-; CHECK-NEXT:    vmovx.f16 s27, s3
+; CHECK-NEXT:    vmovx.f16 s5, s1
+; CHECK-NEXT:    vmovx.f16 s6, s9
+; CHECK-NEXT:    vins.f16 s7, s8
+; CHECK-NEXT:    vins.f16 s5, s6
+; CHECK-NEXT:    vmovx.f16 s6, s13
+; CHECK-NEXT:    vmovx.f16 s8, s25
+; CHECK-NEXT:    vins.f16 s6, s8
+; CHECK-NEXT:    vmovx.f16 s19, s2
+; CHECK-NEXT:    vmovx.f16 s8, s10
+; CHECK-NEXT:    vmovx.f16 s18, s14
+; CHECK-NEXT:    vins.f16 s19, s8
+; CHECK-NEXT:    vmovx.f16 s8, s26
+; CHECK-NEXT:    vins.f16 s18, s8
+; CHECK-NEXT:    vmovx.f16 s23, s3
 ; CHECK-NEXT:    vmovx.f16 s8, s11
-; CHECK-NEXT:    vmovx.f16 s22, s14
-; CHECK-NEXT:    vins.f16 s27, s8
-; CHECK-NEXT:    vins.f16 s14, s30
-; CHECK-NEXT:    vmovx.f16 s26, s15
-; CHECK-NEXT:    vins.f16 s15, s31
-; CHECK-NEXT:    vmovx.f16 s8, s31
+; CHECK-NEXT:    vins.f16 s14, s26
+; CHECK-NEXT:    vins.f16 s23, s8
+; CHECK-NEXT:    vmovx.f16 s22, s15
+; CHECK-NEXT:    vins.f16 s15, s27
+; CHECK-NEXT:    vmovx.f16 s8, s27
+; CHECK-NEXT:    vins.f16 s12, s24
+; CHECK-NEXT:    vins.f16 s13, s25
 ; CHECK-NEXT:    vins.f16 s3, s11
-; CHECK-NEXT:    vins.f16 s26, s8
+; CHECK-NEXT:    vins.f16 s1, s9
+; CHECK-NEXT:    vins.f16 s2, s10
+; CHECK-NEXT:    vins.f16 s22, s8
 ; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    vmovx.f16 s5, s30
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s17, s0
 ; CHECK-NEXT:    vmov.f32 s10, s4
-; CHECK-NEXT:    vmov.f32 s29, s0
-; CHECK-NEXT:    vins.f16 s22, s5
+; CHECK-NEXT:    vmov q6, q0
+; CHECK-NEXT:    vmov.f32 s11, s7
 ; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vmov.f32 s11, s31
-; CHECK-NEXT:    vmov q7, q0
-; CHECK-NEXT:    vmov.f32 s31, s6
-; CHECK-NEXT:    vmov.f32 s16, s13
-; CHECK-NEXT:    vmov.f32 s21, s2
-; CHECK-NEXT:    vmov.f32 s25, s3
-; CHECK-NEXT:    vmov.f32 s17, s29
-; CHECK-NEXT:    vmov.f32 s20, s14
-; CHECK-NEXT:    vmov.f32 s24, s15
-; CHECK-NEXT:    vstrh.16 q5, [r1, #32]
-; CHECK-NEXT:    vstrh.16 q6, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s17, s2
+; CHECK-NEXT:    vmov.f32 s16, s14
+; CHECK-NEXT:    vmov.f32 s21, s3
+; CHECK-NEXT:    vstrh.16 q4, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s20, s15
+; CHECK-NEXT:    vmov.f32 s7, s5
+; CHECK-NEXT:    vstrh.16 q5, [r1, #48]
 ; CHECK-NEXT:    vstrh.16 q2, [r1], #64
-; CHECK-NEXT:    vmov.f32 s19, s31
-; CHECK-NEXT:    vstrh.16 q4, [r1, #-48]
+; CHECK-NEXT:    vmov.f32 s4, s13
+; CHECK-NEXT:    vmov.f32 s5, s25
+; CHECK-NEXT:    vstrh.16 q1, [r1, #-48]
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: @ %while.end
-; CHECK-NEXT:    add sp, #16
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index 9b0bc7e72516c..f2d9593f26418 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -176,8 +176,8 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: vmovn64_b2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f32 s4, s6
-; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vmov.f32 s7, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
@@ -185,8 +185,8 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECKBE-LABEL: vmovn64_b2:
 ; CHECKBE:       @ %bb.0: @ %entry
 ; CHECKBE-NEXT:    vmov.f32 s4, s6
-; CHECKBE-NEXT:    vmov.f32 s5, s7
 ; CHECKBE-NEXT:    vmov.f32 s6, s0
+; CHECKBE-NEXT:    vmov.f32 s5, s7
 ; CHECKBE-NEXT:    vmov.f32 s7, s1
 ; CHECKBE-NEXT:    vmov q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -199,16 +199,16 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: vmovn64_b3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f32 s0, s2
-; CHECK-NEXT:    vmov.f32 s1, s3
 ; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s1, s3
 ; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: vmovn64_b3:
 ; CHECKBE:       @ %bb.0: @ %entry
 ; CHECKBE-NEXT:    vmov.f32 s0, s2
-; CHECKBE-NEXT:    vmov.f32 s1, s3
 ; CHECKBE-NEXT:    vmov.f32 s2, s4
+; CHECKBE-NEXT:    vmov.f32 s1, s3
 ; CHECKBE-NEXT:    vmov.f32 s3, s5
 ; CHECKBE-NEXT:    bx lr
 entry:
@@ -301,11 +301,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s11, s2
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov.f32 s5, s0
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: vmovn32_b2:
@@ -326,22 +326,21 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_b3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s1
-; CHECK-NEXT:    vmov.f32 s9, s4
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.f32 s11, s6
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: vmovn32_b3:
 ; CHECKBE:       @ %bb.0: @ %entry
 ; CHECKBE-NEXT:    vrev64.32 q2, q1
 ; CHECKBE-NEXT:    vrev64.32 q1, q0
-; CHECKBE-NEXT:    vmov.f32 s12, s5
-; CHECKBE-NEXT:    vmov.f32 s13, s8
-; CHECKBE-NEXT:    vmov.f32 s14, s7
-; CHECKBE-NEXT:    vmov.f32 s15, s10
-; CHECKBE-NEXT:    vrev64.32 q0, q3
+; CHECKBE-NEXT:    vmov.f32 s4, s5
+; CHECKBE-NEXT:    vmov.f32 s6, s7
+; CHECKBE-NEXT:    vmov.f32 s5, s8
+; CHECKBE-NEXT:    vmov.f32 s7, s10
+; CHECKBE-NEXT:    vrev64.32 q0, q1
 ; CHECKBE-NEXT:    bx lr
 entry:
   %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
@@ -450,15 +449,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s9, s5
-; CHECK-NEXT:    vins.f16 s9, s1
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vins.f16 s8, s0
-; CHECK-NEXT:    vmovx.f16 s10, s6
-; CHECK-NEXT:    vins.f16 s10, s2
-; CHECK-NEXT:    vmovx.f16 s11, s7
-; CHECK-NEXT:    vins.f16 s11, s3
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovx.f16 s5, s5
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmovx.f16 s6, s6
+; CHECK-NEXT:    vmovx.f16 s7, s7
+; CHECK-NEXT:    vins.f16 s5, s1
+; CHECK-NEXT:    vins.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s6, s2
+; CHECK-NEXT:    vins.f16 s7, s3
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: vmovn16_b2:
@@ -466,12 +465,12 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECKBE-NEXT:    vrev64.16 q2, q0
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    vmovx.f16 s5, s1
-; CHECKBE-NEXT:    vins.f16 s5, s9
 ; CHECKBE-NEXT:    vmovx.f16 s4, s0
-; CHECKBE-NEXT:    vins.f16 s4, s8
 ; CHECKBE-NEXT:    vmovx.f16 s6, s2
-; CHECKBE-NEXT:    vins.f16 s6, s10
 ; CHECKBE-NEXT:    vmovx.f16 s7, s3
+; CHECKBE-NEXT:    vins.f16 s5, s9
+; CHECKBE-NEXT:    vins.f16 s4, s8
+; CHECKBE-NEXT:    vins.f16 s6, s10
 ; CHECKBE-NEXT:    vins.f16 s7, s11
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -483,28 +482,27 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_b3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmovx.f16 s1, s9
+; CHECK-NEXT:    vmovx.f16 s1, s1
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vmovx.f16 s3, s3
 ; CHECK-NEXT:    vins.f16 s1, s5
-; CHECK-NEXT:    vmovx.f16 s0, s8
 ; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s2, s10
 ; CHECK-NEXT:    vins.f16 s2, s6
-; CHECK-NEXT:    vmovx.f16 s3, s11
 ; CHECK-NEXT:    vins.f16 s3, s7
 ; CHECK-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: vmovn16_b3:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q3, q0
 ; CHECKBE-NEXT:    vrev64.16 q2, q1
-; CHECKBE-NEXT:    vmovx.f16 s5, s13
+; CHECKBE-NEXT:    vrev64.16 q1, q0
+; CHECKBE-NEXT:    vmovx.f16 s5, s5
+; CHECKBE-NEXT:    vmovx.f16 s4, s4
+; CHECKBE-NEXT:    vmovx.f16 s6, s6
+; CHECKBE-NEXT:    vmovx.f16 s7, s7
 ; CHECKBE-NEXT:    vins.f16 s5, s9
-; CHECKBE-NEXT:    vmovx.f16 s4, s12
 ; CHECKBE-NEXT:    vins.f16 s4, s8
-; CHECKBE-NEXT:    vmovx.f16 s6, s14
 ; CHECKBE-NEXT:    vins.f16 s6, s10
-; CHECKBE-NEXT:    vmovx.f16 s7, s15
 ; CHECKBE-NEXT:    vins.f16 s7, s11
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
index e4ae6e5dff3a2..b8ddde719a67e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
@@ -148,11 +148,11 @@ entry:
 define arm_aapcs_vfpcc void @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) {
 ; CHECK-LABEL: vmovn64_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s10, s0
-; CHECK-NEXT:    vmov.f32 s11, s1
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 3, i32 0>
@@ -163,11 +163,11 @@ entry:
 define arm_aapcs_vfpcc void @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) {
 ; CHECK-LABEL: vmovn64_b3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vmov.f32 s9, s3
-; CHECK-NEXT:    vmov.f32 s10, s4
-; CHECK-NEXT:    vmov.f32 s11, s5
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 1, i32 2>
@@ -232,11 +232,11 @@ entry:
 define arm_aapcs_vfpcc void @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) {
 ; CHECK-LABEL: vmovn32_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s11, s2
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov.f32 s5, s0
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 5, i32 0, i32 7, i32 2>
@@ -247,11 +247,11 @@ entry:
 define arm_aapcs_vfpcc void @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) {
 ; CHECK-LABEL: vmovn32_b3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s8, s1
-; CHECK-NEXT:    vmov.f32 s9, s4
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.f32 s11, s6
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
@@ -314,15 +314,15 @@ entry:
 define arm_aapcs_vfpcc void @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) {
 ; CHECK-LABEL: vmovn16_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s9, s5
-; CHECK-NEXT:    vins.f16 s9, s1
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vins.f16 s8, s0
-; CHECK-NEXT:    vmovx.f16 s10, s6
-; CHECK-NEXT:    vins.f16 s10, s2
-; CHECK-NEXT:    vmovx.f16 s11, s7
-; CHECK-NEXT:    vins.f16 s11, s3
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vmovx.f16 s5, s5
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmovx.f16 s6, s6
+; CHECK-NEXT:    vmovx.f16 s7, s7
+; CHECK-NEXT:    vins.f16 s5, s1
+; CHECK-NEXT:    vins.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s6, s2
+; CHECK-NEXT:    vins.f16 s7, s3
+; CHECK-NEXT:    vstrw.32 q1, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 9, i32 0, i32 11, i32 2, i32 13, i32 4, i32 15, i32 6>
@@ -333,15 +333,15 @@ entry:
 define arm_aapcs_vfpcc void @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) {
 ; CHECK-LABEL: vmovn16_b3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s9, s1
-; CHECK-NEXT:    vins.f16 s9, s5
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vmovx.f16 s10, s2
-; CHECK-NEXT:    vins.f16 s10, s6
-; CHECK-NEXT:    vmovx.f16 s11, s3
-; CHECK-NEXT:    vins.f16 s11, s7
-; CHECK-NEXT:    vstrw.32 q2, [r0]
+; CHECK-NEXT:    vmovx.f16 s1, s1
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vmovx.f16 s3, s3
+; CHECK-NEXT:    vins.f16 s1, s5
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vins.f16 s2, s6
+; CHECK-NEXT:    vins.f16 s3, s7
+; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 5, i32 12, i32 7, i32 14>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 7e2374f2885f5..f66eb8584a0bd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -190,15 +190,12 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @sext32_0213_0ext(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_0213_0ext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s16, s1
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
-; CHECK-NEXT:    vmov.f32 s18, s3
 ; CHECK-NEXT:    vmullb.s32 q2, q0, q3
-; CHECK-NEXT:    vmullb.s32 q1, q4, q3
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmullb.s32 q1, q0, q3
 ; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -213,15 +210,12 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @sext32_0ext_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_0ext_0213:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s16, s1
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
-; CHECK-NEXT:    vmov.f32 s18, s3
 ; CHECK-NEXT:    vmullb.s32 q2, q3, q0
-; CHECK-NEXT:    vmullb.s32 q1, q3, q4
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmullb.s32 q1, q3, q0
 ; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -241,8 +235,8 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s6, s7
 ; CHECK-NEXT:    umull lr, r12, r1, r0
 ; CHECK-NEXT:    umull r2, r5, r3, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
@@ -252,10 +246,10 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
 ; CHECK-NEXT:    mla r5, r3, r2, r5
 ; CHECK-NEXT:    asrs r3, r3, #31
 ; CHECK-NEXT:    mla r1, r1, r0, r4
-; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vmov r4, s4
 ; CHECK-NEXT:    mla r3, r3, r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    umull r5, lr, r4, r0
 ; CHECK-NEXT:    umull r3, r12, r1, r0
 ; CHECK-NEXT:    vmov q1[2], q1[0], r5, r3
@@ -286,8 +280,8 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK-NEXT:    asrs r4, r0, #31
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s6, s7
 ; CHECK-NEXT:    umull lr, r12, r0, r1
 ; CHECK-NEXT:    umull r2, r5, r0, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
@@ -296,10 +290,10 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK-NEXT:    mla r1, r4, r1, r2
 ; CHECK-NEXT:    asrs r2, r3, #31
 ; CHECK-NEXT:    mla r2, r0, r2, r5
-; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    vmov r5, s4
 ; CHECK-NEXT:    mla r2, r4, r3, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
-; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    umull r3, lr, r0, r5
 ; CHECK-NEXT:    umull r2, r12, r0, r1
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
@@ -474,15 +468,12 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @zext32_0213_0ext(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: zext32_0213_0ext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s16, s1
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
-; CHECK-NEXT:    vmov.f32 s18, s3
 ; CHECK-NEXT:    vmullb.u32 q2, q0, q3
-; CHECK-NEXT:    vmullb.u32 q1, q4, q3
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmullb.u32 q1, q0, q3
 ; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -497,15 +488,12 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @zext32_0ext_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: zext32_0ext_0213:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s16, s1
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
-; CHECK-NEXT:    vmov.f32 s18, s3
 ; CHECK-NEXT:    vmullb.u32 q2, q3, q0
-; CHECK-NEXT:    vmullb.u32 q1, q3, q4
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmullb.u32 q1, q3, q0
 ; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -522,13 +510,13 @@ define arm_aapcs_vfpcc <4 x i64> @zext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
 ; CHECK-NEXT:    umull r1, r12, r1, r0
 ; CHECK-NEXT:    umull r3, r2, r3, r0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r12
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    umull r1, r2, r1, r0
@@ -551,13 +539,13 @@ define arm_aapcs_vfpcc <4 x i64> @zext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s2, s3
 ; CHECK-NEXT:    umull r1, r12, r0, r1
 ; CHECK-NEXT:    umull r3, r2, r0, r3
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r12
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    umull r1, r2, r0, r1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
index ebaeae88af718..eafbf41bc6241 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
@@ -72,16 +72,16 @@ entry:
 define <4 x i64> *@vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vst2_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    add.w r0, r1, #32
-; CHECK-NEXT:    vmov.f64 d4, d1
+; CHECK-NEXT:    vmov.f32 s8, s2
 ; CHECK-NEXT:    vmov.f32 s9, s3
 ; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s10, s6
 ; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov.f32 s11, s7
+; CHECK-NEXT:    vmov.f32 s10, s6
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #16
+; CHECK-NEXT:    vmov.f32 s11, s7
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -144,11 +144,11 @@ entry:
 define <4 x double> *@vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) {
 ; CHECK-LABEL: vst2_v2f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmov.f64 d5, d0
 ; CHECK-NEXT:    vmov.f64 d0, d3
+; CHECK-NEXT:    vmov.f64 d4, d2
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vstrw.32 q2, [r1], #32
 ; CHECK-NEXT:    mov r0, r1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index 67a606396127e..c749b36416f66 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -44,8 +44,8 @@ define void @vst2_v8i32(<8 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK-LABEL: vst2_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
 ; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
@@ -68,10 +68,10 @@ define void @vst2_v16i32(<16 x i32> *%src, <32 x i32> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
@@ -100,17 +100,17 @@ entry:
 define void @vst2_v4i32_align1(<4 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-LABEL: vst2_v4i32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d3
-; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.f32 s9, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s14, s5
 ; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vmov.f32 s12, s4
 ; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s15, s1
 ; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -186,8 +186,8 @@ define void @vst2_v16i16(<16 x i16> *%src, <32 x i16> *%dst) {
 ; CHECK-LABEL: vst2_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
 ; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]!
@@ -207,34 +207,31 @@ entry:
 define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vst2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmovx.f16 s1, s10
-; CHECK-NEXT:    vmovx.f16 s12, s6
-; CHECK-NEXT:    vins.f16 s1, s12
+; CHECK-NEXT:    vmovx.f16 s0, s6
 ; CHECK-NEXT:    vins.f16 s10, s6
-; CHECK-NEXT:    vmov.f32 s0, s10
-; CHECK-NEXT:    vmovx.f16 s12, s7
 ; CHECK-NEXT:    vmovx.f16 s3, s11
+; CHECK-NEXT:    vmovx.f16 s6, s7
 ; CHECK-NEXT:    vins.f16 s11, s7
-; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vmovx.f16 s14, s4
-; CHECK-NEXT:    vins.f16 s3, s12
-; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vins.f16 s3, s6
+; CHECK-NEXT:    vmovx.f16 s6, s8
 ; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vins.f16 s6, s4
+; CHECK-NEXT:    vmovx.f16 s15, s9
+; CHECK-NEXT:    vins.f16 s9, s5
 ; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.f32 s17, s12
+; CHECK-NEXT:    vins.f16 s1, s0
+; CHECK-NEXT:    vmov.f32 s0, s10
+; CHECK-NEXT:    vins.f16 s15, s4
+; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vmov.f32 s13, s6
 ; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s19, s9
-; CHECK-NEXT:    vins.f16 s9, s5
-; CHECK-NEXT:    vmov.f32 s18, s9
-; CHECK-NEXT:    vins.f16 s19, s4
-; CHECK-NEXT:    vstrb.8 q4, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov.f32 s14, s9
+; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -328,12 +325,12 @@ entry:
 define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
 ; CHECK-LABEL: vst2_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vmov.f32 s9, s5
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmov.f32 s10, s0
 ; CHECK-NEXT:    vmov.f32 s11, s1
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s5
 ; CHECK-NEXT:    vmov.f32 s0, s6
 ; CHECK-NEXT:    vstrb.8 q2, [r1], #16
 ; CHECK-NEXT:    vmov.f32 s1, s7
@@ -354,25 +351,25 @@ define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d6, d1
-; CHECK-NEXT:    vmov.f64 d10, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s12, s2
 ; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s20, s6
 ; CHECK-NEXT:    vmov.f32 s21, s7
 ; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmov.f32 s22, s10
 ; CHECK-NEXT:    vmov.f32 s3, s17
-; CHECK-NEXT:    vmov.f32 s7, s9
+; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.f32 s7, s9
+; CHECK-NEXT:    vmov.f32 s14, s18
 ; CHECK-NEXT:    vstrb.8 q1, [r1], #48
-; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.f32 s22, s10
 ; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.f32 s23, s11
 ; CHECK-NEXT:    vstrw.32 q5, [r1, #-32]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
@@ -429,8 +426,8 @@ define void @vst2_v8f32(<8 x float> *%src, <16 x float> *%dst) {
 ; CHECK-LABEL: vst2_v8f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
 ; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
@@ -453,10 +450,10 @@ define void @vst2_v16f32(<16 x float> *%src, <32 x float> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
@@ -485,17 +482,17 @@ entry:
 define void @vst2_v4f32_align1(<4 x float> *%src, <8 x float> *%dst) {
 ; CHECK-LABEL: vst2_v4f32_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d3
-; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.f32 s9, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vmov.f32 s14, s5
 ; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vmov.f32 s12, s4
 ; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s15, s1
 ; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -543,19 +540,19 @@ define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
 ; CHECK-NEXT:    vmov.32 q1[0], r3
 ; CHECK-NEXT:    vmov.32 q0[1], r12
 ; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmovx.f16 s10, s4
+; CHECK-NEXT:    vmovx.f16 s2, s0
 ; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vmovx.f16 s10, s1
-; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vins.f16 s1, s5
-; CHECK-NEXT:    vins.f16 s10, s4
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.f32 s5, s8
-; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vstrh.16 q1, [r1]
+; CHECK-NEXT:    vmovx.f16 s6, s5
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmov.f32 s9, s2
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s11, s4
+; CHECK-NEXT:    vstrh.16 q2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -589,8 +586,8 @@ define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) {
 ; CHECK-LABEL: vst2_v16f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vst20.16 {q2, q3}, [r1]
 ; CHECK-NEXT:    vst21.16 {q2, q3}, [r1]!
@@ -610,32 +607,32 @@ entry:
 define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vst2_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vmovx.f16 s1, s6
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vins.f16 s1, s12
-; CHECK-NEXT:    vins.f16 s6, s10
+; CHECK-NEXT:    vmovx.f16 s0, s10
+; CHECK-NEXT:    vins.f16 s1, s0
 ; CHECK-NEXT:    vmovx.f16 s3, s7
-; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vins.f16 s6, s10
+; CHECK-NEXT:    vins.f16 s3, s0
+; CHECK-NEXT:    vmovx.f16 s10, s4
+; CHECK-NEXT:    vmovx.f16 s0, s8
 ; CHECK-NEXT:    vins.f16 s7, s11
-; CHECK-NEXT:    vins.f16 s3, s12
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmovx.f16 s14, s8
 ; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vmovx.f16 s14, s5
+; CHECK-NEXT:    vins.f16 s10, s0
+; CHECK-NEXT:    vmovx.f16 s8, s5
 ; CHECK-NEXT:    vins.f16 s5, s9
-; CHECK-NEXT:    vmovx.f16 s8, s9
+; CHECK-NEXT:    vmovx.f16 s0, s9
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vins.f16 s8, s0
 ; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vins.f16 s14, s8
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmov.f32 s9, s12
-; CHECK-NEXT:    vmov.f32 s10, s5
 ; CHECK-NEXT:    vmov.f32 s2, s7
-; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:    vmov.f32 s13, s10
 ; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
-; CHECK-NEXT:    vstrb.8 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov.f32 s15, s8
+; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
@@ -652,8 +649,8 @@ entry:
 define void @vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) {
 ; CHECK-LABEL: vst2_v2f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmov.f64 d4, d3
 ; CHECK-NEXT:    vmov.f64 d5, d1
 ; CHECK-NEXT:    vmov.f64 d3, d0
@@ -675,17 +672,17 @@ define void @vst2_v4f64(<4 x double> *%src, <8 x double> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
 ; CHECK-NEXT:    vmov.f64 d8, d4
 ; CHECK-NEXT:    vmov.f64 d9, d0
 ; CHECK-NEXT:    vmov.f64 d0, d5
 ; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    vmov.f64 d4, d6
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vmov.f64 d5, d2
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f64 d4, d6
 ; CHECK-NEXT:    vmov.f64 d2, d7
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 1e46dd1b256f5..7d4763fdeb03a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -12,16 +12,15 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
 ; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
 ; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
-; CHECK-NEXT:    vmov.32 q0[0], r4
 ; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
-; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q0[0], r4
 ; CHECK-NEXT:    vmov.f32 s8, s7
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmov.f32 s9, s6
 ; CHECK-NEXT:    vmov.f32 s10, s0
 ; CHECK-NEXT:    vmov.f32 s11, s5
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.f32 s8, s4
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
 ; CHECK-NEXT:    strd r2, r0, [r1, #16]
 ; CHECK-NEXT:    pop {r4, pc}
@@ -44,22 +43,22 @@ define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d8, d6
-; CHECK-NEXT:    vmov.f32 s17, s4
 ; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vmov.f32 s19, s13
 ; CHECK-NEXT:    vmov.f32 s9, s1
 ; CHECK-NEXT:    vmov.f32 s18, s0
 ; CHECK-NEXT:    vmov.f32 s0, s2
-; CHECK-NEXT:    vstrw.32 q4, [r1]
 ; CHECK-NEXT:    vmov.f32 s11, s6
-; CHECK-NEXT:    vmov.f32 s1, s15
 ; CHECK-NEXT:    vmov.f32 s10, s14
-; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmov.f32 s16, s12
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s19, s13
+; CHECK-NEXT:    vmov.f32 s1, s15
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
@@ -84,41 +83,41 @@ define void @vst3_v8i32(<8 x i32> *%src, <24 x i32> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q7, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vmov.f64 d10, d8
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vmov.f32 s21, s28
-; CHECK-NEXT:    vmov.f64 d14, d12
-; CHECK-NEXT:    vmov.f64 d4, d1
-; CHECK-NEXT:    vmov.f32 s29, s12
-; CHECK-NEXT:    vmov.f32 s9, s27
-; CHECK-NEXT:    vmov.f32 s31, s25
+; CHECK-NEXT:    vmov.f32 s8, s2
+; CHECK-NEXT:    vmov.f32 s20, s28
+; CHECK-NEXT:    vmov.f32 s9, s19
+; CHECK-NEXT:    vmov.f32 s28, s16
+; CHECK-NEXT:    vmov.f32 s31, s17
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s11, s3
+; CHECK-NEXT:    vmov.f32 s10, s15
+; CHECK-NEXT:    vmov.f32 s23, s29
+; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s22, s4
+; CHECK-NEXT:    vmov.f32 s21, s24
+; CHECK-NEXT:    vmov.f32 s29, s12
+; CHECK-NEXT:    vstrw.32 q5, [r1]
 ; CHECK-NEXT:    vmov.f32 s30, s0
 ; CHECK-NEXT:    vmov.f32 s0, s13
 ; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
 ; CHECK-NEXT:    vmov.f32 s3, s14
-; CHECK-NEXT:    vmov.f32 s2, s26
-; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s10, s15
-; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
-; CHECK-NEXT:    vmov.f32 s23, s17
-; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s12, s25
 ; CHECK-NEXT:    vmov.f32 s13, s5
-; CHECK-NEXT:    vmov.f32 s22, s4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s4, s6
-; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vmov.f32 s12, s25
 ; CHECK-NEXT:    vmov.f32 s15, s26
-; CHECK-NEXT:    vmov.f32 s5, s19
 ; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmov.f32 s6, s27
+; CHECK-NEXT:    vmov.f32 s5, s19
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s6, s27
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -142,108 +141,106 @@ define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #160
-; CHECK-NEXT:    sub sp, #160
+; CHECK-NEXT:    .pad #144
+; CHECK-NEXT:    sub sp, #144
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
-; CHECK-NEXT:    vldrw.u32 q6, [r0]
-; CHECK-NEXT:    vstrw.32 q7, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #80]
-; CHECK-NEXT:    vmov.f32 s16, s1
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
-; CHECK-NEXT:    vstrw.32 q7, [sp, #144] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s17, s9
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s19, s2
-; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vstrw.32 q7, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
+; CHECK-NEXT:    vldrw.u32 q6, [r0]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
-; CHECK-NEXT:    vmov.f64 d8, d5
+; CHECK-NEXT:    vmov.f32 s16, s1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
+; CHECK-NEXT:    vmov.f32 s19, s2
 ; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s17, s9
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s18, s26
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
+; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s18, s3
+; CHECK-NEXT:    vmov.f32 s16, s10
 ; CHECK-NEXT:    vmov.f32 s17, s27
 ; CHECK-NEXT:    vmov.f32 s19, s11
-; CHECK-NEXT:    vmov.f32 s18, s3
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
-; CHECK-NEXT:    vmov.f64 d8, d3
-; CHECK-NEXT:    vmov.f32 s17, s31
+; CHECK-NEXT:    vmov.f32 s16, s6
 ; CHECK-NEXT:    vmov.f32 s19, s7
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d8, d12
-; CHECK-NEXT:    vmov.f32 s17, s0
-; CHECK-NEXT:    vmov.f32 s19, s25
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s18, s31
+; CHECK-NEXT:    vstrw.32 q4, [sp, #112] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s18, s8
-; CHECK-NEXT:    vmov q2, q7
-; CHECK-NEXT:    vmov.f64 d0, d4
-; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s1, s12
-; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov.f64 d4, d14
 ; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s4, s13
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s17, s0
+; CHECK-NEXT:    vmov.f32 s3, s13
+; CHECK-NEXT:    vmov.f32 s0, s12
+; CHECK-NEXT:    vmov.f64 d14, d4
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s7, s14
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s16, s24
+; CHECK-NEXT:    vmov.f32 s19, s25
+; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s17, s1
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d0, d14
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s14
+; CHECK-NEXT:    vmov.f32 s7, s30
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d4, d1
-; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s31, s1
+; CHECK-NEXT:    vmov.f64 d0, d10
 ; CHECK-NEXT:    vmov.f32 s16, s5
-; CHECK-NEXT:    vmov.f32 s17, s1
 ; CHECK-NEXT:    vmov.f32 s19, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d12, d11
-; CHECK-NEXT:    vmov q7, q1
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s18, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov q0, q7
-; CHECK-NEXT:    vmov.f32 s25, s7
-; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
-; CHECK-NEXT:    vmov.f32 s27, s23
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s10, s15
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s29, s20
-; CHECK-NEXT:    vmov q5, q1
-; CHECK-NEXT:    vmov.f32 s31, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s26, s15
-; CHECK-NEXT:    vstrw.32 q2, [r1, #128]
-; CHECK-NEXT:    vmov.f32 s30, s0
-; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
-; CHECK-NEXT:    vmov.f64 d0, d2
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov.f32 s29, s4
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s24, s2
+; CHECK-NEXT:    vmov.f32 s30, s4
+; CHECK-NEXT:    vmov.f32 s27, s3
 ; CHECK-NEXT:    vstrw.32 q7, [r1, #96]
-; CHECK-NEXT:    vmov.f32 s1, s12
-; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s12
-; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f64 d10, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s15, s6
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s14, s22
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s13, s11
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s25, s23
+; CHECK-NEXT:    vstrw.32 q3, [r1, #128]
+; CHECK-NEXT:    vmov.f32 s26, s11
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #160
+; CHECK-NEXT:    vmov.f32 s6, s20
+; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s5, s8
+; CHECK-NEXT:    vmov.f32 s20, s9
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s23, s10
+; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    add sp, #144
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -303,23 +300,23 @@ define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrh.u32 q1, [r0]
 ; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
-; CHECK-NEXT:    vmov.f64 d6, d5
-; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vmov r0, r5, d2
+; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    vmov lr, r4, d1
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.f32 s15, s11
+; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov.32 q3[2], r4
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmov r12, s6
+; CHECK-NEXT:    vmov.32 q1[2], r4
 ; CHECK-NEXT:    vmov r0, r4, d4
+; CHECK-NEXT:    vstrh.32 q1, [r1, #16]
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r12, s6
 ; CHECK-NEXT:    vmov.16 q0[3], r5
-; CHECK-NEXT:    vstrh.32 q3, [r1, #16]
 ; CHECK-NEXT:    vmov.16 q0[4], r3
 ; CHECK-NEXT:    vmov.16 q0[5], r4
 ; CHECK-NEXT:    vmov.16 q0[6], r12
@@ -343,64 +340,52 @@ entry:
 define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
 ; CHECK-LABEL: vst3_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vmov.f32 s0, s8
 ; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmovx.f16 s20, s8
 ; CHECK-NEXT:    vins.f16 s0, s4
 ; CHECK-NEXT:    vmov.f32 s12, s9
 ; CHECK-NEXT:    vins.f16 s12, s5
 ; CHECK-NEXT:    vmov.16 q0[4], r2
 ; CHECK-NEXT:    vmov.f32 s3, s12
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vmov.f32 s17, s12
-; CHECK-NEXT:    vmov.f32 s18, s12
-; CHECK-NEXT:    vins.f16 s17, s20
-; CHECK-NEXT:    vmovx.f16 s20, s18
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmovx.f16 s20, s14
-; CHECK-NEXT:    vmov.f32 s18, s2
-; CHECK-NEXT:    vmov.f32 s1, s17
-; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmovx.f16 s8, s8
 ; CHECK-NEXT:    vmovx.f16 s16, s6
-; CHECK-NEXT:    vins.f16 s16, s20
-; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmov.f32 s1, s12
 ; CHECK-NEXT:    vins.f16 s17, s7
-; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vins.f16 s1, s8
+; CHECK-NEXT:    vmovx.f16 s8, s12
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s14
+; CHECK-NEXT:    vins.f16 s16, s8
 ; CHECK-NEXT:    vmovx.f16 s19, s7
-; CHECK-NEXT:    vrev32.16 q1, q1
-; CHECK-NEXT:    vins.f16 s19, s20
-; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmovx.f16 s8, s15
 ; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vmovx.f16 s24, s17
-; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vins.f16 s21, s24
-; CHECK-NEXT:    vmovx.f16 s24, s22
-; CHECK-NEXT:    vins.f16 s18, s24
-; CHECK-NEXT:    vmov.f32 s12, s13
-; CHECK-NEXT:    vmov.f32 s22, s18
-; CHECK-NEXT:    vmov.f32 s17, s21
-; CHECK-NEXT:    vmov.f32 s18, s22
-; CHECK-NEXT:    vmovx.f16 s20, s9
-; CHECK-NEXT:    vins.f16 s12, s20
-; CHECK-NEXT:    vmovx.f16 s20, s10
-; CHECK-NEXT:    vins.f16 s14, s20
+; CHECK-NEXT:    vins.f16 s19, s8
+; CHECK-NEXT:    vmovx.f16 s8, s17
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vins.f16 s17, s8
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vins.f16 s18, s8
+; CHECK-NEXT:    vmov.f32 s8, s13
+; CHECK-NEXT:    vins.f16 s8, s12
+; CHECK-NEXT:    vmovx.f16 s12, s10
+; CHECK-NEXT:    vins.f16 s14, s12
+; CHECK-NEXT:    vrev32.16 q1, q1
+; CHECK-NEXT:    vmovx.f16 s12, s13
+; CHECK-NEXT:    vmovx.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s5, s12
+; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:    vins.f16 s10, s4
+; CHECK-NEXT:    vmov.f32 s9, s5
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s15, s14
-; CHECK-NEXT:    vmov.f32 s14, s10
-; CHECK-NEXT:    vmovx.f16 s8, s13
-; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vins.f16 s14, s8
-; CHECK-NEXT:    vmov.f32 s6, s14
-; CHECK-NEXT:    vmov.f32 s13, s5
-; CHECK-NEXT:    vmov.f32 s14, s6
-; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -421,135 +406,112 @@ define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #80
-; CHECK-NEXT:    sub sp, #80
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s0, s14
-; CHECK-NEXT:    vmovx.f16 s8, s22
-; CHECK-NEXT:    vins.f16 s8, s0
-; CHECK-NEXT:    vmovx.f16 s0, s15
-; CHECK-NEXT:    vins.f16 s9, s23
-; CHECK-NEXT:    vmov.u16 r2, q6[1]
-; CHECK-NEXT:    vmovx.f16 s11, s23
-; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vins.f16 s11, s0
-; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s10, s15
-; CHECK-NEXT:    vmovx.f16 s4, s9
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s1, s11
-; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q6, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s0, s10
+; CHECK-NEXT:    vmovx.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s1, s7
+; CHECK-NEXT:    vins.f16 s4, s0
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vmovx.f16 s7, s7
+; CHECK-NEXT:    vmov.f32 s12, s4
+; CHECK-NEXT:    vins.f16 s7, s0
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s18, s11
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vins.f16 s13, s4
+; CHECK-NEXT:    vmovx.f16 s4, s3
 ; CHECK-NEXT:    vins.f16 s18, s4
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov.f32 s2, s18
-; CHECK-NEXT:    vmov.f64 d4, d2
-; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s28, s4
-; CHECK-NEXT:    vins.f16 s8, s24
-; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vmov.16 q2[4], r2
-; CHECK-NEXT:    vmov.f32 s11, s5
-; CHECK-NEXT:    vins.f16 s11, s25
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s20, s24
+; CHECK-NEXT:    vins.f16 s20, s4
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s23, s25
+; CHECK-NEXT:    vmovx.f16 s4, s24
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT:    vmov.f32 s9, s4
-; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vins.f16 s23, s5
+; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s5, s24
-; CHECK-NEXT:    vmov.f32 s6, s24
-; CHECK-NEXT:    vins.f16 s5, s28
-; CHECK-NEXT:    vmovx.f16 s28, s6
-; CHECK-NEXT:    vins.f16 s10, s28
-; CHECK-NEXT:    vmov.f32 s18, s2
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s9, s5
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    vmov.f64 d14, d2
-; CHECK-NEXT:    vins.f16 s28, s20
-; CHECK-NEXT:    vmov.f32 s0, s5
-; CHECK-NEXT:    vins.f16 s0, s21
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s28, s0
+; CHECK-NEXT:    vins.f16 s5, s4
+; CHECK-NEXT:    vmovx.f16 s4, s24
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vins.f16 s28, s12
+; CHECK-NEXT:    vins.f16 s22, s4
+; CHECK-NEXT:    vmov.f32 s4, s1
 ; CHECK-NEXT:    vmov.16 q7[4], r0
-; CHECK-NEXT:    vmov.f32 s31, s0
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s1, s12
-; CHECK-NEXT:    vmov.f32 s29, s4
-; CHECK-NEXT:    vmovx.f16 s4, s4
-; CHECK-NEXT:    vmov.f32 s2, s12
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    vmovx.f16 s4, s2
-; CHECK-NEXT:    vins.f16 s30, s4
-; CHECK-NEXT:    vmovx.f16 s4, s26
-; CHECK-NEXT:    vmov.f32 s2, s30
-; CHECK-NEXT:    vmov.f32 s29, s1
-; CHECK-NEXT:    vmov.f32 s12, s13
-; CHECK-NEXT:    vmov.f32 s30, s2
-; CHECK-NEXT:    vmovx.f16 s0, s18
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:    vins.f16 s1, s7
+; CHECK-NEXT:    vins.f16 s4, s13
+; CHECK-NEXT:    vmov.f32 s21, s5
+; CHECK-NEXT:    vmov.f32 s31, s4
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmov.f32 s29, s8
+; CHECK-NEXT:    vins.f16 s29, s0
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vins.f16 s30, s0
+; CHECK-NEXT:    vmovx.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s0, s26
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s5, s7
+; CHECK-NEXT:    vmovx.f16 s7, s7
+; CHECK-NEXT:    vmovx.f16 s0, s27
+; CHECK-NEXT:    vins.f16 s7, s0
+; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vmov.f32 s13, s19
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s27
+; CHECK-NEXT:    vmovx.f16 s0, s19
+; CHECK-NEXT:    vmov.f32 s12, s25
+; CHECK-NEXT:    vins.f16 s14, s0
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vins.f16 s0, s6
+; CHECK-NEXT:    vmovx.f16 s6, s2
+; CHECK-NEXT:    vins.f16 s10, s6
+; CHECK-NEXT:    vmovx.f16 s6, s9
+; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s5, s13
 ; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
-; CHECK-NEXT:    vmovx.f16 s3, s7
-; CHECK-NEXT:    vmovx.f16 s4, s27
-; CHECK-NEXT:    vins.f16 s3, s4
-; CHECK-NEXT:    vmov.f32 s5, s23
-; CHECK-NEXT:    vmov.f32 s2, s27
-; CHECK-NEXT:    vmovx.f16 s16, s1
-; CHECK-NEXT:    vmov.f32 s6, s23
-; CHECK-NEXT:    vins.f16 s5, s16
+; CHECK-NEXT:    vrev32.16 q2, q2
+; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vmovx.f16 s8, s17
+; CHECK-NEXT:    vins.f16 s9, s6
+; CHECK-NEXT:    vmovx.f16 s6, s10
+; CHECK-NEXT:    vins.f16 s12, s8
+; CHECK-NEXT:    vmovx.f16 s8, s18
+; CHECK-NEXT:    vmov.f32 s10, s18
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmovx.f16 s20, s6
-; CHECK-NEXT:    vmov.f32 s24, s25
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmovx.f16 s20, s17
-; CHECK-NEXT:    vins.f16 s12, s20
-; CHECK-NEXT:    vmovx.f16 s20, s18
-; CHECK-NEXT:    vins.f16 s14, s20
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vmov.f32 s15, s14
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vmovx.f16 s16, s13
-; CHECK-NEXT:    vstr s16, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s1, s5
-; CHECK-NEXT:    vrev32.16 q5, q4
-; CHECK-NEXT:    vldr s16, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT:    vins.f16 s21, s16
-; CHECK-NEXT:    vmovx.f16 s16, s22
-; CHECK-NEXT:    vins.f16 s14, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s6
-; CHECK-NEXT:    vmovx.f16 s4, s17
-; CHECK-NEXT:    vmov.f32 s22, s14
-; CHECK-NEXT:    vins.f16 s24, s4
-; CHECK-NEXT:    vmovx.f16 s4, s18
-; CHECK-NEXT:    vins.f16 s26, s4
-; CHECK-NEXT:    vmov.f32 s13, s21
-; CHECK-NEXT:    vmov.f32 s27, s26
-; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s26, s18
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmovx.f16 s4, s25
+; CHECK-NEXT:    vins.f16 s26, s8
+; CHECK-NEXT:    vmov.f32 s15, s26
+; CHECK-NEXT:    vmovx.f16 s8, s25
+; CHECK-NEXT:    vrev32.16 q6, q4
+; CHECK-NEXT:    vins.f16 s2, s6
+; CHECK-NEXT:    vins.f16 s25, s8
+; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vmovx.f16 s8, s26
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vrev32.16 q4, q4
-; CHECK-NEXT:    vins.f16 s17, s4
-; CHECK-NEXT:    vmovx.f16 s4, s18
-; CHECK-NEXT:    vins.f16 s26, s4
-; CHECK-NEXT:    vmov.f32 s14, s22
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
-; CHECK-NEXT:    vmov.f32 s25, s17
+; CHECK-NEXT:    vins.f16 s10, s8
+; CHECK-NEXT:    vmov.f32 s6, s14
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s13, s25
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s26, s18
-; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
-; CHECK-NEXT:    add sp, #80
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -664,29 +626,26 @@ entry:
 define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
 ; CHECK-LABEL: vst3_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrb.u16 q2, [r0, #8]
-; CHECK-NEXT:    vmovx.f16 s12, s6
-; CHECK-NEXT:    vmovx.f16 s0, s10
-; CHECK-NEXT:    vins.f16 s0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT:    vldrb.u16 q3, [r0]
 ; CHECK-NEXT:    vins.f16 s1, s11
+; CHECK-NEXT:    vmovx.f16 s2, s6
+; CHECK-NEXT:    vmovx.f16 s0, s10
 ; CHECK-NEXT:    vmovx.f16 s3, s11
-; CHECK-NEXT:    vins.f16 s3, s12
-; CHECK-NEXT:    vldrb.u16 q3, [r0]
-; CHECK-NEXT:    vmov.f32 s2, s7
-; CHECK-NEXT:    vmovx.f16 s20, s1
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s7
+; CHECK-NEXT:    vmovx.f16 s1, s1
 ; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vins.f16 s3, s2
+; CHECK-NEXT:    vins.f16 s17, s1
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmovx.f16 s1, s15
 ; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vins.f16 s17, s20
-; CHECK-NEXT:    vmovx.f16 s20, s18
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vins.f16 s2, s1
 ; CHECK-NEXT:    vmov.f32 s1, s17
-; CHECK-NEXT:    vmov.f32 s2, s18
 ; CHECK-NEXT:    vmov.8 q4[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[0]
 ; CHECK-NEXT:    vstrb.16 q0, [r1, #16]
@@ -720,7 +679,7 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
 ; CHECK-NEXT:    vmov.u16 r0, q3[5]
 ; CHECK-NEXT:    vmov.8 q4[15], r0
 ; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
@@ -957,11 +916,9 @@ entry:
 define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
 ; CHECK-LABEL: vst3_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov.f64 d6, d5
-; CHECK-NEXT:    vmov.f32 s13, s11
 ; CHECK-NEXT:    vmov.f32 s14, s2
 ; CHECK-NEXT:    vmov.f32 s15, s3
 ; CHECK-NEXT:    vmov.f32 s2, s6
@@ -969,8 +926,10 @@ define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    vmov.f32 s7, s9
 ; CHECK-NEXT:    vstrb.8 q1, [r1], #32
-; CHECK-NEXT:    vstrw.32 q3, [r1]
+; CHECK-NEXT:    vmov.f32 s12, s10
+; CHECK-NEXT:    vmov.f32 s13, s11
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #-16]
+; CHECK-NEXT:    vstrw.32 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
@@ -991,41 +950,37 @@ define void @vst3_v4i64(<4 x i64> *%src, <12 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT:    vmov.f64 d10, d2
-; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s16, s14
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s17, s15
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vmov.f64 d7, d15
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vmov.f32 s20, s4
+; CHECK-NEXT:    vstrw.32 q4, [r1, #80]
 ; CHECK-NEXT:    vmov.f32 s21, s5
 ; CHECK-NEXT:    vmov.f32 s22, s28
 ; CHECK-NEXT:    vmov.f32 s23, s29
-; CHECK-NEXT:    vmov.f64 d14, d12
+; CHECK-NEXT:    vmov.f32 s4, s8
 ; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vmov.f32 s5, s9
+; CHECK-NEXT:    vmov.f32 s28, s24
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s29, s25
-; CHECK-NEXT:    vmov.f64 d8, d7
 ; CHECK-NEXT:    vmov.f32 s30, s12
-; CHECK-NEXT:    vmov.f32 s17, s15
 ; CHECK-NEXT:    vmov.f32 s31, s13
-; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s18, s2
-; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s4, s8
-; CHECK-NEXT:    vmov.f32 s19, s3
 ; CHECK-NEXT:    vmov.f32 s2, s26
-; CHECK-NEXT:    vstrw.32 q4, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s5, s9
-; CHECK-NEXT:    vmov.f32 s8, s14
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
 ; CHECK-NEXT:    vmov.f32 s3, s27
-; CHECK-NEXT:    vmov.f32 s9, s15
+; CHECK-NEXT:    vmov.f32 s8, s14
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s9, s15
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
-; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1047,10 +1002,10 @@ entry:
 define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) {
 ; CHECK-LABEL: vst3_v2f32:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0, #20]
 ; CHECK-NEXT:    vldr s0, [r0]
 ; CHECK-NEXT:    vldr s3, [r0, #4]
 ; CHECK-NEXT:    vldr s1, [r0, #8]
-; CHECK-NEXT:    ldr r2, [r0, #20]
 ; CHECK-NEXT:    vldr s2, [r0, #16]
 ; CHECK-NEXT:    ldr r0, [r0, #12]
 ; CHECK-NEXT:    strd r0, r2, [r1, #16]
@@ -1075,22 +1030,22 @@ define void @vst3_v4f32(<4 x float> *%src, <12 x float> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d8, d6
-; CHECK-NEXT:    vmov.f32 s17, s0
 ; CHECK-NEXT:    vmov.f32 s8, s1
-; CHECK-NEXT:    vmov.f32 s19, s13
 ; CHECK-NEXT:    vmov.f32 s9, s5
 ; CHECK-NEXT:    vmov.f32 s18, s4
 ; CHECK-NEXT:    vmov.f32 s4, s6
-; CHECK-NEXT:    vstrw.32 q4, [r1]
 ; CHECK-NEXT:    vmov.f32 s11, s2
-; CHECK-NEXT:    vmov.f32 s5, s15
 ; CHECK-NEXT:    vmov.f32 s10, s14
-; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s16, s12
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s17, s0
+; CHECK-NEXT:    vmov.f32 s19, s13
+; CHECK-NEXT:    vmov.f32 s5, s15
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.f32 s6, s3
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
@@ -1115,41 +1070,41 @@ define void @vst3_v8f32(<8 x float> *%src, <24 x float> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vmov.f64 d10, d8
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vmov.f32 s21, s24
-; CHECK-NEXT:    vmov.f64 d12, d4
-; CHECK-NEXT:    vmov.f64 d6, d1
-; CHECK-NEXT:    vmov.f32 s25, s28
-; CHECK-NEXT:    vmov.f32 s13, s11
-; CHECK-NEXT:    vmov.f32 s27, s9
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s20, s24
+; CHECK-NEXT:    vmov.f32 s13, s19
+; CHECK-NEXT:    vmov.f32 s24, s16
+; CHECK-NEXT:    vmov.f32 s27, s17
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vmov.f32 s14, s11
+; CHECK-NEXT:    vmov.f32 s23, s25
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s22, s4
+; CHECK-NEXT:    vmov.f32 s21, s28
+; CHECK-NEXT:    vmov.f32 s25, s8
+; CHECK-NEXT:    vstrw.32 q5, [r1]
 ; CHECK-NEXT:    vmov.f32 s26, s0
-; CHECK-NEXT:    vmov.f32 s0, s29
+; CHECK-NEXT:    vmov.f32 s0, s9
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s3, s30
-; CHECK-NEXT:    vmov.f32 s14, s31
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s23, s17
-; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s2, s10
-; CHECK-NEXT:    vmov.f32 s8, s29
-; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s3, s10
 ; CHECK-NEXT:    vmov.f32 s9, s5
-; CHECK-NEXT:    vmov.f32 s22, s4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s4, s6
-; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vmov.f32 s8, s29
 ; CHECK-NEXT:    vmov.f32 s11, s30
-; CHECK-NEXT:    vmov.f32 s5, s19
 ; CHECK-NEXT:    vmov.f32 s10, s18
-; CHECK-NEXT:    vmov.f32 s6, s31
+; CHECK-NEXT:    vmov.f32 s5, s19
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s6, s31
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -1173,107 +1128,106 @@ define void @vst3_v16f32(<16 x float> *%src, <48 x float> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #160
-; CHECK-NEXT:    sub sp, #160
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    .pad #144
+; CHECK-NEXT:    sub sp, #144
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vstrw.32 q5, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0]
+; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #160]
+; CHECK-NEXT:    vstrw.32 q5, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s12, s1
-; CHECK-NEXT:    vstrw.32 q5, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
+; CHECK-NEXT:    vstrw.32 q7, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s13, s9
 ; CHECK-NEXT:    vmov.f32 s15, s2
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #160]
-; CHECK-NEXT:    vstrw.32 q5, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
 ; CHECK-NEXT:    vmov.f32 s14, s26
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q4, [sp, #128] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #144]
-; CHECK-NEXT:    vstrw.32 q5, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
-; CHECK-NEXT:    vmov.f64 d6, d5
-; CHECK-NEXT:    vstrw.32 q7, [sp, #144] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s12, s10
 ; CHECK-NEXT:    vmov.f32 s13, s27
 ; CHECK-NEXT:    vmov.f32 s15, s11
-; CHECK-NEXT:    vmov.f32 s14, s3
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
-; CHECK-NEXT:    vmov.f64 d6, d3
 ; CHECK-NEXT:    vmov.f32 s13, s23
+; CHECK-NEXT:    vmov.f32 s12, s6
 ; CHECK-NEXT:    vmov.f32 s15, s7
 ; CHECK-NEXT:    vmov.f32 s14, s31
-; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d6, d12
+; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vmov.f32 s15, s25
 ; CHECK-NEXT:    vmov.f32 s14, s8
-; CHECK-NEXT:    vmov q2, q7
-; CHECK-NEXT:    vmov.f64 d0, d10
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f64 d4, d14
+; CHECK-NEXT:    vmov.f32 s0, s20
 ; CHECK-NEXT:    vmov.f32 s3, s21
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vmov.f64 d10, d2
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s20
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f64 d14, d2
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s6, s22
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d4, d1
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vmov.f32 s20, s5
+; CHECK-NEXT:    vmov.f32 s20, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s23, s30
+; CHECK-NEXT:    vmov.f32 s12, s24
+; CHECK-NEXT:    vstrw.32 q5, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s15, s25
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s21, s1
-; CHECK-NEXT:    vmov.f32 s23, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d12, d9
-; CHECK-NEXT:    vmov q7, q1
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov q0, q7
-; CHECK-NEXT:    vmov.f32 s25, s7
-; CHECK-NEXT:    vstrw.32 q5, [r1, #112]
-; CHECK-NEXT:    vmov.f32 s27, s19
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s10, s15
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s29, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s15, s3
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d0, d14
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s31, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s26, s15
-; CHECK-NEXT:    vstrw.32 q2, [r1, #128]
-; CHECK-NEXT:    vmov.f32 s30, s0
-; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
-; CHECK-NEXT:    vmov.f64 d0, d2
+; CHECK-NEXT:    vmov.f64 d0, d8
+; CHECK-NEXT:    vmov.f32 s20, s9
+; CHECK-NEXT:    vmov.f32 s23, s10
+; CHECK-NEXT:    vmov.f32 s14, s11
+; CHECK-NEXT:    vmov.f32 s29, s8
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s24, s2
+; CHECK-NEXT:    vmov.f32 s30, s8
+; CHECK-NEXT:    vmov.f32 s27, s3
 ; CHECK-NEXT:    vstrw.32 q7, [r1, #96]
-; CHECK-NEXT:    vmov.f32 s1, s12
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s8, s0
+; CHECK-NEXT:    vmov.f32 s11, s1
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f64 d8, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s16, s13
+; CHECK-NEXT:    vstrw.32 q5, [r1, #112]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s19, s14
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s18, s6
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s13, s7
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q4, [r1, #64]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s25, s19
+; CHECK-NEXT:    vstrw.32 q3, [r1, #128]
+; CHECK-NEXT:    vmov.f32 s26, s7
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #160
+; CHECK-NEXT:    vmov.f32 s10, s16
+; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s9, s4
+; CHECK-NEXT:    vmov.f32 s16, s5
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s19, s6
+; CHECK-NEXT:    vstrw.32 q4, [r1, #64]
+; CHECK-NEXT:    add sp, #144
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1297,14 +1251,14 @@ define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldmia r0, {s0, s1}
 ; CHECK-NEXT:    ldr r0, [r0, #8]
-; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmovx.f16 s2, s0
 ; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmovx.f16 s6, s4
+; CHECK-NEXT:    vins.f16 s4, s2
 ; CHECK-NEXT:    vmovx.f16 s2, s1
-; CHECK-NEXT:    vmovx.f16 s10, s4
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vins.f16 s2, s10
 ; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vins.f16 s2, s6
 ; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov r0, r2, d0
 ; CHECK-NEXT:    stm r1!, {r0, r2, r3}
@@ -1328,8 +1282,6 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    ldrd r2, r12, [r0]
 ; CHECK-NEXT:    ldrd r3, lr, [r0, #8]
 ; CHECK-NEXT:    vmov.32 q0[0], r2
@@ -1337,30 +1289,29 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    vmov.32 q1[0], r3
 ; CHECK-NEXT:    vmov.32 q0[1], r12
 ; CHECK-NEXT:    vmov.32 q1[1], lr
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmovx.f16 s10, s0
 ; CHECK-NEXT:    vmov.f32 s8, s1
-; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    vins.f16 s8, s5
+; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmovx.f16 s13, s3
+; CHECK-NEXT:    vmovx.f16 s6, s0
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s10, s4
 ; CHECK-NEXT:    vmovx.f16 s2, s2
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vins.f16 s4, s10
-; CHECK-NEXT:    vins.f16 s2, s12
-; CHECK-NEXT:    vmovx.f16 s10, s1
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vmovx.f16 s17, s3
-; CHECK-NEXT:    vins.f16 s5, s10
-; CHECK-NEXT:    vins.f16 s17, s12
-; CHECK-NEXT:    vmov.f32 s16, s5
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s2, s10
+; CHECK-NEXT:    vmovx.f16 s10, s5
+; CHECK-NEXT:    vins.f16 s5, s6
+; CHECK-NEXT:    vins.f16 s13, s10
+; CHECK-NEXT:    vmov.f32 s12, s5
 ; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vmov.f32 s3, s8
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vmov r0, r2, d8
+; CHECK-NEXT:    vmov r0, r2, d6
 ; CHECK-NEXT:    strd r0, r2, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -1379,65 +1330,53 @@ entry:
 define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
 ; CHECK-LABEL: vst3_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d0, d4
-; CHECK-NEXT:    vmovx.f16 s6, s20
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vmov.f32 s4, s9
-; CHECK-NEXT:    vins.f16 s0, s20
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vins.f16 s4, s21
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmovx.f16 s2, s12
+; CHECK-NEXT:    vins.f16 s0, s12
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov.f32 s3, s4
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmovx.f16 s24, s7
-; CHECK-NEXT:    vmov.f32 s18, s4
-; CHECK-NEXT:    vins.f16 s17, s12
-; CHECK-NEXT:    vmovx.f16 s12, s18
-; CHECK-NEXT:    vins.f16 s2, s12
-; CHECK-NEXT:    vmovx.f16 s15, s23
-; CHECK-NEXT:    vins.f16 s15, s24
-; CHECK-NEXT:    vmovx.f16 s24, s6
-; CHECK-NEXT:    vmovx.f16 s12, s22
-; CHECK-NEXT:    vmov.f32 s18, s2
-; CHECK-NEXT:    vins.f16 s12, s24
-; CHECK-NEXT:    vmov.f32 s25, s11
-; CHECK-NEXT:    vins.f16 s13, s23
-; CHECK-NEXT:    vmov.f32 s26, s11
-; CHECK-NEXT:    vmov.f32 s14, s7
-; CHECK-NEXT:    vmovx.f16 s28, s13
-; CHECK-NEXT:    vins.f16 s25, s28
-; CHECK-NEXT:    vmovx.f16 s28, s26
-; CHECK-NEXT:    vins.f16 s14, s28
-; CHECK-NEXT:    vmovx.f16 s28, s9
-; CHECK-NEXT:    vmov.f32 s4, s5
-; CHECK-NEXT:    vrev32.16 q5, q5
-; CHECK-NEXT:    vins.f16 s4, s28
-; CHECK-NEXT:    vmovx.f16 s28, s10
-; CHECK-NEXT:    vins.f16 s6, s28
-; CHECK-NEXT:    vmov.f32 s26, s14
-; CHECK-NEXT:    vmov.f32 s7, s6
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vins.f16 s21, s8
-; CHECK-NEXT:    vmovx.f16 s8, s22
-; CHECK-NEXT:    vins.f16 s6, s8
-; CHECK-NEXT:    vmov.f32 s1, s17
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s13, s25
-; CHECK-NEXT:    vmov.f32 s5, s21
-; CHECK-NEXT:    vmov.f32 s2, s18
-; CHECK-NEXT:    vmov.f32 s14, s26
+; CHECK-NEXT:    vmovx.f16 s4, s4
+; CHECK-NEXT:    vmov.f32 s1, s16
+; CHECK-NEXT:    vmovx.f16 s11, s15
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vmovx.f16 s4, s16
+; CHECK-NEXT:    vins.f16 s8, s13
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s19
+; CHECK-NEXT:    vmov.f32 s3, s8
+; CHECK-NEXT:    vins.f16 s11, s4
+; CHECK-NEXT:    vmovx.f16 s4, s18
+; CHECK-NEXT:    vmovx.f16 s8, s14
+; CHECK-NEXT:    vins.f16 s9, s15
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmovx.f16 s4, s9
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vins.f16 s9, s4
+; CHECK-NEXT:    vrev32.16 q3, q3
+; CHECK-NEXT:    vmov.f32 s10, s19
+; CHECK-NEXT:    vmovx.f16 s4, s7
+; CHECK-NEXT:    vins.f16 s10, s4
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.f32 s12, s17
+; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
+; CHECK-NEXT:    vins.f16 s12, s4
+; CHECK-NEXT:    vmovx.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s18, s4
+; CHECK-NEXT:    vmovx.f16 s4, s17
+; CHECK-NEXT:    vins.f16 s13, s4
+; CHECK-NEXT:    vmovx.f16 s4, s14
+; CHECK-NEXT:    vins.f16 s6, s4
+; CHECK-NEXT:    vmov.f32 s15, s18
+; CHECK-NEXT:    vmov.f32 s14, s6
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s6, s22
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
@@ -1458,150 +1397,121 @@ define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #128
-; CHECK-NEXT:    sub sp, #128
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q6, [r0]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s0, s19
-; CHECK-NEXT:    vmovx.f16 s7, s15
+; CHECK-NEXT:    .pad #96
+; CHECK-NEXT:    sub sp, #96
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vmovx.f16 s0, s15
+; CHECK-NEXT:    vmovx.f16 s7, s11
 ; CHECK-NEXT:    vins.f16 s7, s0
-; CHECK-NEXT:    vmovx.f16 s0, s18
-; CHECK-NEXT:    vmovx.f16 s4, s14
-; CHECK-NEXT:    vstrw.32 q5, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov q6, q2
+; CHECK-NEXT:    vmovx.f16 s0, s14
+; CHECK-NEXT:    vmovx.f16 s4, s10
+; CHECK-NEXT:    vins.f16 s1, s11
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vins.f16 s4, s0
-; CHECK-NEXT:    vmov.f64 d14, d12
-; CHECK-NEXT:    vins.f16 s5, s15
-; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s6, s19
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmov.f32 s5, s27
-; CHECK-NEXT:    vmov.f32 s6, s27
-; CHECK-NEXT:    vins.f16 s28, s12
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s5, s11
 ; CHECK-NEXT:    vins.f16 s5, s0
-; CHECK-NEXT:    vmovx.f16 s0, s6
-; CHECK-NEXT:    vins.f16 s10, s0
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d2, d10
-; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vmovx.f16 s2, s8
-; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s0, s21
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vins.f16 s0, s9
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmovx.f16 s2, s12
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmovx.f16 s0, s20
-; CHECK-NEXT:    vmov.f32 s5, s20
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.f32 s9, s20
-; CHECK-NEXT:    vmov.16 q7[4], r0
-; CHECK-NEXT:    vmov.f32 s10, s20
-; CHECK-NEXT:    vins.f16 s9, s0
-; CHECK-NEXT:    vmovx.f16 s0, s10
+; CHECK-NEXT:    vmov.f32 s6, s15
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vmov q7, q4
 ; CHECK-NEXT:    vins.f16 s6, s0
-; CHECK-NEXT:    vmov.f32 s0, s25
-; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vmov q2, q4
-; CHECK-NEXT:    vins.f16 s0, s13
-; CHECK-NEXT:    vstrw.32 q1, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s5, s8
+; CHECK-NEXT:    vmovx.f16 s2, s20
+; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s4, s16
+; CHECK-NEXT:    vins.f16 s4, s20
+; CHECK-NEXT:    vmov.f32 s0, s17
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vstrw.32 q7, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s16, s4
+; CHECK-NEXT:    vmovx.f16 s4, s28
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #80]
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vins.f16 s0, s21
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s19, s0
+; CHECK-NEXT:    vmovx.f16 s0, s28
+; CHECK-NEXT:    vins.f16 s18, s0
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmovx.f16 s8, s24
+; CHECK-NEXT:    vmov.f32 s22, s28
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.f32 s17, s28
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vins.f16 s17, s4
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmov.f32 s21, s12
+; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vins.f16 s21, s0
+; CHECK-NEXT:    vmovx.f16 s0, s12
+; CHECK-NEXT:    vins.f16 s22, s0
+; CHECK-NEXT:    vmovx.f16 s0, s30
+; CHECK-NEXT:    vins.f16 s24, s0
+; CHECK-NEXT:    vmovx.f16 s0, s31
+; CHECK-NEXT:    vmovx.f16 s27, s11
+; CHECK-NEXT:    vins.f16 s4, s25
+; CHECK-NEXT:    vins.f16 s27, s0
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s25, s11
+; CHECK-NEXT:    vmov.f32 s23, s4
+; CHECK-NEXT:    vmovx.f16 s4, s25
+; CHECK-NEXT:    vmov.f32 s25, s3
+; CHECK-NEXT:    vmov.f32 s26, s31
+; CHECK-NEXT:    vmovx.f16 s0, s3
+; CHECK-NEXT:    vins.f16 s25, s4
+; CHECK-NEXT:    vins.f16 s26, s0
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vmov.f32 s0, s29
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s30, s4
+; CHECK-NEXT:    vmov.f32 s6, s18
+; CHECK-NEXT:    vrev32.16 q2, q2
+; CHECK-NEXT:    vmovx.f16 s4, s29
+; CHECK-NEXT:    vmov.f32 s3, s30
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s9, s4
+; CHECK-NEXT:    vmovx.f16 s4, s10
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s29
+; CHECK-NEXT:    vmov.f32 s8, s13
+; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmovx.f16 s4, s30
+; CHECK-NEXT:    vins.f16 s14, s4
+; CHECK-NEXT:    vmov.f32 s10, s30
+; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:    vmovx.f16 s4, s13
 ; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s31, s0
-; CHECK-NEXT:    vmovx.f16 s0, s24
-; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vins.f16 s5, s0
-; CHECK-NEXT:    vmov.f32 s29, s24
-; CHECK-NEXT:    vmovx.f16 s0, s6
-; CHECK-NEXT:    vstrw.32 q1, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vins.f16 s30, s0
-; CHECK-NEXT:    vmovx.f16 s0, s22
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s6
+; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vrev32.16 q3, q3
+; CHECK-NEXT:    vmov.f32 s6, s30
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s13, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s14
-; CHECK-NEXT:    vmov.f32 s8, s9
-; CHECK-NEXT:    vins.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s0, s23
-; CHECK-NEXT:    vmovx.f16 s7, s15
-; CHECK-NEXT:    vins.f16 s7, s0
-; CHECK-NEXT:    vins.f16 s5, s15
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s6, s23
-; CHECK-NEXT:    vmovx.f16 s16, s5
-; CHECK-NEXT:    vmov.f32 s1, s15
-; CHECK-NEXT:    vmov.f32 s2, s15
-; CHECK-NEXT:    vins.f16 s1, s16
-; CHECK-NEXT:    vmovx.f16 s16, s2
-; CHECK-NEXT:    vins.f16 s6, s16
-; CHECK-NEXT:    vmovx.f16 s16, s13
-; CHECK-NEXT:    vmov.f32 s20, s21
-; CHECK-NEXT:    vins.f16 s20, s16
-; CHECK-NEXT:    vmovx.f16 s16, s14
-; CHECK-NEXT:    vins.f16 s22, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s23, s22
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s14, s30
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s6
-; CHECK-NEXT:    vmov.f32 s22, s14
-; CHECK-NEXT:    vmovx.f16 s12, s21
-; CHECK-NEXT:    vstr s12, [sp, #64] @ 4-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s5, s1
-; CHECK-NEXT:    vrev32.16 q4, q3
-; CHECK-NEXT:    vldr s12, [sp, #64] @ 4-byte Reload
-; CHECK-NEXT:    vins.f16 s17, s12
-; CHECK-NEXT:    vmovx.f16 s12, s18
-; CHECK-NEXT:    vins.f16 s22, s12
-; CHECK-NEXT:    vmovx.f16 s12, s25
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vins.f16 s8, s12
-; CHECK-NEXT:    vmovx.f16 s0, s26
-; CHECK-NEXT:    vmov.f32 s18, s22
-; CHECK-NEXT:    vins.f16 s10, s0
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s11, s10
-; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s10, s26
-; CHECK-NEXT:    vrev32.16 q6, q0
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vins.f16 s25, s12
-; CHECK-NEXT:    vmovx.f16 s12, s26
-; CHECK-NEXT:    vins.f16 s10, s12
-; CHECK-NEXT:    vmov.f32 s29, s1
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s30, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s26, s10
-; CHECK-NEXT:    vmov.f32 s1, s13
-; CHECK-NEXT:    vstrw.32 q7, [r1]
-; CHECK-NEXT:    vmov.f32 s2, s14
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s14
-; CHECK-NEXT:    vmov.f32 s13, s1
-; CHECK-NEXT:    vmov.f32 s21, s17
-; CHECK-NEXT:    vmov.f32 s9, s25
-; CHECK-NEXT:    vmov.f32 s22, s18
-; CHECK-NEXT:    vmov.f32 s10, s26
-; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vins.f16 s10, s4
+; CHECK-NEXT:    vmov.f32 s9, s13
+; CHECK-NEXT:    vmov.f32 s4, s28
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s14, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
-; CHECK-NEXT:    add sp, #128
+; CHECK-NEXT:    vmov.f32 s7, s31
+; CHECK-NEXT:    vstrw.32 q4, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    add sp, #96
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1623,8 +1533,8 @@ entry:
 define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) {
 ; CHECK-LABEL: vst3_v2f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vmov.f64 d6, d2
 ; CHECK-NEXT:    vmov.f64 d7, d1
@@ -1653,32 +1563,28 @@ define void @vst3_v4f64(<4 x double> *%src, <12 x double> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmov.f64 d6, d15
-; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT:    vmov.f64 d10, d2
+; CHECK-NEXT:    vmov.f64 d15, d13
 ; CHECK-NEXT:    vmov.f64 d7, d1
-; CHECK-NEXT:    vmov.f64 d11, d12
+; CHECK-NEXT:    vmov.f64 d10, d2
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
-; CHECK-NEXT:    vmov.f64 d12, d4
+; CHECK-NEXT:    vmov.f64 d11, d12
+; CHECK-NEXT:    vmov.f64 d2, d8
 ; CHECK-NEXT:    vstrw.32 q5, [r1]
 ; CHECK-NEXT:    vmov.f64 d1, d5
-; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d2, d8
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmov.f64 d8, d15
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
+; CHECK-NEXT:    vmov.f64 d12, d4
+; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
 ; CHECK-NEXT:    vmov.f64 d13, d14
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vmov.f64 d8, d5
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
-; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
-; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
index 4c939fc09e59b..ee1fe9e69c255 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
@@ -104,21 +104,21 @@ define <8 x i64> *@vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vmov.f64 d4, d8
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s8, s16
 ; CHECK-NEXT:    vmov.f32 s9, s17
 ; CHECK-NEXT:    vmov.f32 s10, s0
 ; CHECK-NEXT:    vmov.f32 s11, s1
 ; CHECK-NEXT:    vmov.f32 s0, s18
 ; CHECK-NEXT:    vmov.f32 s1, s19
-; CHECK-NEXT:    vmov.f64 d8, d6
-; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s17, s13
 ; CHECK-NEXT:    vmov.f32 s18, s4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vmov.f32 s19, s5
+; CHECK-NEXT:    vmov.f32 s16, s12
+; CHECK-NEXT:    vmov.f32 s17, s13
 ; CHECK-NEXT:    vmov.f32 s4, s14
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s5, s15
@@ -215,16 +215,16 @@ define <8 x double> *@vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    vmov.f64 d2, d6
 ; CHECK-NEXT:    vmov.f64 d3, d0
 ; CHECK-NEXT:    vmov.f64 d0, d7
-; CHECK-NEXT:    vmov.f64 d6, d8
-; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vmov.f64 d7, d4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f64 d6, d8
 ; CHECK-NEXT:    vmov.f64 d4, d9
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #48]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index f275049eddfc6..db4a438ae076a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -13,20 +13,20 @@ define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
 ; CHECK-NEXT:    ldm r6, {r4, r5, r6}
 ; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
 ; CHECK-NEXT:    ldr r0, [r0, #28]
+; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r6
-; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vmov.f32 s8, s4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r5, r0
 ; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
 ; CHECK-NEXT:    vmov.f32 s10, s0
 ; CHECK-NEXT:    vmov.f32 s11, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
@@ -78,12 +78,12 @@ define void @vst4_v8i32(<8 x i32> *%src, <32 x i32> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
 ; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
@@ -120,55 +120,50 @@ define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #192
 ; CHECK-NEXT:    sub sp, #192
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #240]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #224]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
+; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
+; CHECK-NEXT:    vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
+; CHECK-NEXT:    add r2, sp, #128
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #160]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
+; CHECK-NEXT:    vmov q6, q2
+; CHECK-NEXT:    vmov q7, q3
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
@@ -210,28 +205,28 @@ define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s12, s17
-; CHECK-NEXT:    vmov.f64 d10, d8
-; CHECK-NEXT:    vmov.f32 s24, s19
-; CHECK-NEXT:    vmov.f32 s13, s9
-; CHECK-NEXT:    vmov.f32 s21, s8
-; CHECK-NEXT:    vmov.f32 s25, s11
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
 ; CHECK-NEXT:    vmov.f32 s14, s1
 ; CHECK-NEXT:    vmov.f32 s22, s0
 ; CHECK-NEXT:    vmov.f32 s26, s3
-; CHECK-NEXT:    vmov.f32 s0, s18
-; CHECK-NEXT:    vmov.f32 s1, s10
+; CHECK-NEXT:    vmov.f32 s12, s17
+; CHECK-NEXT:    vmov.f32 s13, s9
 ; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s23, s4
+; CHECK-NEXT:    vmov.f32 s20, s16
 ; CHECK-NEXT:    vstrb.8 q3, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s27, s7
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.f32 s23, s4
+; CHECK-NEXT:    vmov.f32 s24, s19
 ; CHECK-NEXT:    vstrb.8 q5, [r1]
-; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vmov.f32 s25, s11
+; CHECK-NEXT:    vmov.f32 s27, s7
+; CHECK-NEXT:    vmov.f32 s0, s18
 ; CHECK-NEXT:    vstrb.8 q6, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s1, s10
+; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vstrb.8 q0, [r1, #32]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
@@ -385,12 +380,12 @@ define void @vst4_v16i16(<16 x i16> *%src, <64 x i16> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vst40.16 {q4, q5, q6, q7}, [r1]
 ; CHECK-NEXT:    vst41.16 {q4, q5, q6, q7}, [r1]
@@ -421,61 +416,61 @@ entry:
 define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) {
 ; CHECK-LABEL: vst4_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
 ; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vins.f16 s5, s9
 ; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmovx.f16 s20, s17
+; CHECK-NEXT:    vins.f16 s5, s9
 ; CHECK-NEXT:    vins.f16 s12, s0
 ; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vmov.f32 s3, s12
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vmovx.f16 s27, s4
 ; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmov.f32 s3, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vmov.f32 s5, s4
+; CHECK-NEXT:    vmovx.f16 s8, s8
+; CHECK-NEXT:    vmovx.f16 s0, s17
 ; CHECK-NEXT:    vmovx.f16 s2, s13
+; CHECK-NEXT:    vins.f16 s27, s8
+; CHECK-NEXT:    vmovx.f16 s4, s12
+; CHECK-NEXT:    vmovx.f16 s8, s16
 ; CHECK-NEXT:    vins.f16 s13, s17
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmovx.f16 s20, s8
-; CHECK-NEXT:    vins.f16 s27, s20
-; CHECK-NEXT:    vmov.f32 s0, s13
-; CHECK-NEXT:    vmovx.f16 s28, s12
-; CHECK-NEXT:    vmovx.f16 s20, s16
 ; CHECK-NEXT:    vins.f16 s12, s16
-; CHECK-NEXT:    vins.f16 s28, s20
 ; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:    vmov.f32 s25, s4
-; CHECK-NEXT:    vmov.f32 s22, s28
-; CHECK-NEXT:    vmovx.f16 s28, s11
-; CHECK-NEXT:    vmov.f32 s21, s4
-; CHECK-NEXT:    vmovx.f16 s8, s10
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmov.f32 s22, s4
+; CHECK-NEXT:    vmovx.f16 s4, s11
 ; CHECK-NEXT:    vmov.f32 s23, s27
 ; CHECK-NEXT:    vmovx.f16 s27, s7
-; CHECK-NEXT:    vins.f16 s27, s28
 ; CHECK-NEXT:    vins.f16 s7, s11
-; CHECK-NEXT:    vmov.f32 s25, s7
-; CHECK-NEXT:    vmovx.f16 s28, s19
+; CHECK-NEXT:    vins.f16 s27, s4
 ; CHECK-NEXT:    vmovx.f16 s26, s15
-; CHECK-NEXT:    vins.f16 s15, s19
-; CHECK-NEXT:    vins.f16 s26, s28
-; CHECK-NEXT:    vmovx.f16 s31, s6
+; CHECK-NEXT:    vmovx.f16 s4, s19
+; CHECK-NEXT:    vmov.f32 s25, s7
+; CHECK-NEXT:    vins.f16 s26, s4
+; CHECK-NEXT:    vmovx.f16 s7, s6
+; CHECK-NEXT:    vmovx.f16 s4, s10
 ; CHECK-NEXT:    vins.f16 s6, s10
-; CHECK-NEXT:    vins.f16 s31, s8
-; CHECK-NEXT:    vmov.f32 s29, s6
+; CHECK-NEXT:    vmov.f32 s21, s5
+; CHECK-NEXT:    vins.f16 s15, s19
+; CHECK-NEXT:    vins.f16 s7, s4
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmovx.f16 s6, s14
 ; CHECK-NEXT:    vmovx.f16 s4, s18
-; CHECK-NEXT:    vmovx.f16 s30, s14
-; CHECK-NEXT:    vmov.f32 s24, s15
 ; CHECK-NEXT:    vins.f16 s14, s18
-; CHECK-NEXT:    vins.f16 s30, s4
-; CHECK-NEXT:    vmov.f32 s28, s14
+; CHECK-NEXT:    vins.f16 s2, s0
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vmov.f32 s24, s15
+; CHECK-NEXT:    vins.f16 s6, s4
+; CHECK-NEXT:    vmov.f32 s4, s14
 ; CHECK-NEXT:    vstrb.8 q6, [r1, #48]
-; CHECK-NEXT:    vstrb.8 q7, [r1, #32]
+; CHECK-NEXT:    vstrb.8 q1, [r1, #32]
 ; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
 ; CHECK-NEXT:    vstrb.8 q5, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -723,25 +718,25 @@ define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d6, d8
-; CHECK-NEXT:    vmov.f64 d10, d4
-; CHECK-NEXT:    vmov.f32 s13, s17
-; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vmov.f32 s14, s0
-; CHECK-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEXT:    vmov.f32 s15, s1
+; CHECK-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s12, s16
+; CHECK-NEXT:    vmov.f32 s13, s17
+; CHECK-NEXT:    vmov.f32 s20, s8
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s21, s9
 ; CHECK-NEXT:    vmov.f32 s0, s18
 ; CHECK-NEXT:    vstrw.32 q5, [r1]
-; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vmov.f32 s1, s19
-; CHECK-NEXT:    vmov.f32 s5, s11
+; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s5, s11
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
@@ -766,57 +761,56 @@ define void @vst4_v4i64(<4 x i64> *%src, <16 x i64> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #48
-; CHECK-NEXT:    sub sp, #48
-; CHECK-NEXT:    vldrw.u32 q7, [r0]
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT:    vmov.f64 d12, d14
+; CHECK-NEXT:    vldrw.u32 q7, [r0]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s25, s29
-; CHECK-NEXT:    vmov.f32 s26, s0
-; CHECK-NEXT:    vmov.f32 s27, s1
-; CHECK-NEXT:    vmov.f32 s0, s30
-; CHECK-NEXT:    vstrw.32 q6, [r1]
-; CHECK-NEXT:    vmov.f32 s1, s31
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d2, d6
-; CHECK-NEXT:    vmov.f32 s5, s13
-; CHECK-NEXT:    vmov.f64 d14, d0
-; CHECK-NEXT:    vmov.f32 s29, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vmov.f64 d13, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s4, s28
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s5, s29
+; CHECK-NEXT:    vmov.f32 s24, s30
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s25, s31
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s7, s9
+; CHECK-NEXT:    vmov.f32 s4, s12
+; CHECK-NEXT:    vmov.f32 s5, s13
 ; CHECK-NEXT:    vmov.f32 s8, s14
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s9, s15
-; CHECK-NEXT:    vmov.f64 d6, d0
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d1, d15
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s13, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d13, d7
+; CHECK-NEXT:    vmov.f32 s14, s20
+; CHECK-NEXT:    vmov.f32 s15, s21
 ; CHECK-NEXT:    vmov.f32 s30, s16
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
 ; CHECK-NEXT:    vmov.f32 s31, s17
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s16, s2
 ; CHECK-NEXT:    vstrw.32 q7, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s17, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s14, s20
-; CHECK-NEXT:    vstrw.32 q4, [r1, #96]
-; CHECK-NEXT:    vmov.f32 s15, s21
-; CHECK-NEXT:    vmov.f32 s20, s2
-; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s21, s3
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s20, s26
+; CHECK-NEXT:    vstrw.32 q4, [r1, #96]
+; CHECK-NEXT:    vmov.f32 s21, s27
+; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
 ; CHECK-NEXT:    vstrw.32 q5, [r1, #112]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
-; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -901,12 +895,12 @@ define void @vst4_v8f32(<8 x float> *%src, <32 x float> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vst40.32 {q4, q5, q6, q7}, [r1]
 ; CHECK-NEXT:    vst41.32 {q4, q5, q6, q7}, [r1]
@@ -943,55 +937,50 @@ define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #192
 ; CHECK-NEXT:    sub sp, #192
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #240]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #224]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
+; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
+; CHECK-NEXT:    vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
+; CHECK-NEXT:    add r2, sp, #128
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #160]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
+; CHECK-NEXT:    vmov q6, q2
+; CHECK-NEXT:    vmov q7, q3
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT:    vst43.32 {q0, q1, q2, q3}, [r0]!
-; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT:    vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vst42.32 {q0, q1, q2, q3}, [r0]
@@ -1033,28 +1022,28 @@ define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vmov.f32 s12, s17
-; CHECK-NEXT:    vmov.f64 d10, d8
-; CHECK-NEXT:    vmov.f32 s24, s19
-; CHECK-NEXT:    vmov.f32 s13, s9
-; CHECK-NEXT:    vmov.f32 s21, s8
-; CHECK-NEXT:    vmov.f32 s25, s11
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
 ; CHECK-NEXT:    vmov.f32 s14, s1
 ; CHECK-NEXT:    vmov.f32 s22, s0
 ; CHECK-NEXT:    vmov.f32 s26, s3
-; CHECK-NEXT:    vmov.f32 s0, s18
-; CHECK-NEXT:    vmov.f32 s1, s10
+; CHECK-NEXT:    vmov.f32 s12, s17
+; CHECK-NEXT:    vmov.f32 s13, s9
 ; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s23, s4
+; CHECK-NEXT:    vmov.f32 s20, s16
 ; CHECK-NEXT:    vstrb.8 q3, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s27, s7
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.f32 s23, s4
+; CHECK-NEXT:    vmov.f32 s24, s19
 ; CHECK-NEXT:    vstrb.8 q5, [r1]
-; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vmov.f32 s25, s11
+; CHECK-NEXT:    vmov.f32 s27, s7
+; CHECK-NEXT:    vmov.f32 s0, s18
 ; CHECK-NEXT:    vstrb.8 q6, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s1, s10
+; CHECK-NEXT:    vmov.f32 s3, s6
 ; CHECK-NEXT:    vstrb.8 q0, [r1, #32]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
@@ -1079,17 +1068,18 @@ entry:
 define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vst4_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldmia r0, {s0, s1}
+; CHECK-NEXT:    vldr s0, [r0]
+; CHECK-NEXT:    vldr s5, [r0, #4]
 ; CHECK-NEXT:    vldr s4, [r0, #8]
 ; CHECK-NEXT:    vmovx.f16 s2, s0
-; CHECK-NEXT:    vldr s5, [r0, #12]
-; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vldr s1, [r0, #12]
+; CHECK-NEXT:    vmovx.f16 s6, s5
 ; CHECK-NEXT:    vmovx.f16 s3, s4
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vins.f16 s4, s5
-; CHECK-NEXT:    vins.f16 s2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vins.f16 s3, s8
+; CHECK-NEXT:    vins.f16 s2, s6
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s4, s1
+; CHECK-NEXT:    vins.f16 s0, s5
+; CHECK-NEXT:    vins.f16 s3, s6
 ; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
@@ -1122,37 +1112,33 @@ define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
 ; CHECK-NEXT:    vmov.32 q0[0], r3
 ; CHECK-NEXT:    vmov.32 q0[1], r12
 ; CHECK-NEXT:    ldrd r2, r12, [r0]
-; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
-; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vmovx.f16 s2, s4
+; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    vmov.32 q2[0], r3
-; CHECK-NEXT:    vins.f16 s12, s4
+; CHECK-NEXT:    vins.f16 s0, s4
 ; CHECK-NEXT:    vmov.32 q1[0], r2
 ; CHECK-NEXT:    vmov.32 q2[1], r0
 ; CHECK-NEXT:    vmov.32 q1[1], r12
-; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vmov.f32 s7, s9
-; CHECK-NEXT:    vmovx.f16 s14, s4
-; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vins.f16 s14, s8
+; CHECK-NEXT:    vins.f16 s12, s2
+; CHECK-NEXT:    vmovx.f16 s6, s4
+; CHECK-NEXT:    vmovx.f16 s2, s8
+; CHECK-NEXT:    vins.f16 s6, s2
 ; CHECK-NEXT:    vmovx.f16 s11, s1
-; CHECK-NEXT:    vins.f16 s1, s3
-; CHECK-NEXT:    vmovx.f16 s13, s3
-; CHECK-NEXT:    vins.f16 s11, s13
+; CHECK-NEXT:    vmovx.f16 s2, s3
 ; CHECK-NEXT:    vmovx.f16 s10, s5
-; CHECK-NEXT:    vins.f16 s5, s7
-; CHECK-NEXT:    vmovx.f16 s13, s7
-; CHECK-NEXT:    vins.f16 s10, s13
+; CHECK-NEXT:    vins.f16 s11, s2
+; CHECK-NEXT:    vmovx.f16 s2, s9
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vins.f16 s5, s9
+; CHECK-NEXT:    vins.f16 s4, s8
 ; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vmov.f32 s5, s0
-; CHECK-NEXT:    vmov.f32 s6, s14
+; CHECK-NEXT:    vins.f16 s10, s2
 ; CHECK-NEXT:    vmov.f32 s9, s1
-; CHECK-NEXT:    vmov.f32 s7, s12
+; CHECK-NEXT:    vmov.f32 s5, s0
 ; CHECK-NEXT:    vstrh.16 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s7, s12
 ; CHECK-NEXT:    vstrh.16 q1, [r1]
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -1205,12 +1191,12 @@ define void @vst4_v16f16(<16 x half> *%src, <64 x half> *%dst) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #96]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vst40.16 {q4, q5, q6, q7}, [r1]
 ; CHECK-NEXT:    vst41.16 {q4, q5, q6, q7}, [r1]
@@ -1241,70 +1227,61 @@ entry:
 define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
 ; CHECK-LABEL: vst4_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
+; CHECK-NEXT:    .vsave {d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmovx.f16 s2, s5
-; CHECK-NEXT:    vmovx.f16 s0, s29
-; CHECK-NEXT:    vins.f16 s2, s0
-; CHECK-NEXT:    vmovx.f16 s12, s25
-; CHECK-NEXT:    vstr s2, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vmovx.f16 s2, s21
+; CHECK-NEXT:    vins.f16 s0, s2
 ; CHECK-NEXT:    vmovx.f16 s2, s9
-; CHECK-NEXT:    vins.f16 s5, s29
-; CHECK-NEXT:    vins.f16 s2, s12
+; CHECK-NEXT:    vmovx.f16 s12, s25
 ; CHECK-NEXT:    vmovx.f16 s19, s4
-; CHECK-NEXT:    vmovx.f16 s12, s28
-; CHECK-NEXT:    vins.f16 s9, s25
+; CHECK-NEXT:    vins.f16 s2, s12
+; CHECK-NEXT:    vmovx.f16 s12, s20
 ; CHECK-NEXT:    vins.f16 s19, s12
-; CHECK-NEXT:    vmovx.f16 s14, s8
-; CHECK-NEXT:    vmovx.f16 s12, s24
-; CHECK-NEXT:    vins.f16 s14, s12
-; CHECK-NEXT:    vins.f16 s4, s28
-; CHECK-NEXT:    vstr s14, [sp] @ 4-byte Spill
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vmovx.f16 s14, s24
 ; CHECK-NEXT:    vmovx.f16 s15, s7
-; CHECK-NEXT:    vmovx.f16 s20, s31
-; CHECK-NEXT:    vins.f16 s8, s24
-; CHECK-NEXT:    vins.f16 s15, s20
-; CHECK-NEXT:    vmovx.f16 s20, s27
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vmovx.f16 s14, s23
+; CHECK-NEXT:    vins.f16 s15, s14
 ; CHECK-NEXT:    vmovx.f16 s14, s11
-; CHECK-NEXT:    vins.f16 s7, s31
-; CHECK-NEXT:    vins.f16 s14, s20
+; CHECK-NEXT:    vmovx.f16 s1, s27
+; CHECK-NEXT:    vins.f16 s7, s23
+; CHECK-NEXT:    vins.f16 s14, s1
 ; CHECK-NEXT:    vmovx.f16 s23, s6
-; CHECK-NEXT:    vmovx.f16 s28, s30
-; CHECK-NEXT:    vins.f16 s6, s30
-; CHECK-NEXT:    vins.f16 s23, s28
-; CHECK-NEXT:    vins.f16 s11, s27
+; CHECK-NEXT:    vmovx.f16 s1, s22
+; CHECK-NEXT:    vins.f16 s6, s22
+; CHECK-NEXT:    vins.f16 s5, s21
+; CHECK-NEXT:    vins.f16 s4, s20
+; CHECK-NEXT:    vins.f16 s23, s1
 ; CHECK-NEXT:    vmovx.f16 s22, s10
-; CHECK-NEXT:    vmovx.f16 s24, s26
-; CHECK-NEXT:    vldr s28, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vins.f16 s22, s24
 ; CHECK-NEXT:    vins.f16 s10, s26
+; CHECK-NEXT:    vmovx.f16 s1, s26
+; CHECK-NEXT:    vins.f16 s9, s25
+; CHECK-NEXT:    vins.f16 s8, s24
+; CHECK-NEXT:    vins.f16 s11, s27
 ; CHECK-NEXT:    vmov q6, q1
-; CHECK-NEXT:    vmov.f32 s27, s28
-; CHECK-NEXT:    vldr s28, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vins.f16 s22, s1
 ; CHECK-NEXT:    vmov.f32 s1, s25
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s21, s6
-; CHECK-NEXT:    vmov.f32 s12, s11
-; CHECK-NEXT:    vmov.f32 s20, s10
-; CHECK-NEXT:    vstrb.8 q3, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s3, s27
 ; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov.f32 s26, s28
-; CHECK-NEXT:    vstrb.8 q5, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s25, s4
+; CHECK-NEXT:    vmov.f32 s3, s0
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov.f32 s26, s12
 ; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s25, s4
 ; CHECK-NEXT:    vmov.f32 s27, s19
+; CHECK-NEXT:    vmov.f32 s13, s7
 ; CHECK-NEXT:    vstrb.8 q6, [r1]
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vmov.f32 s12, s11
+; CHECK-NEXT:    vmov.f32 s21, s6
+; CHECK-NEXT:    vstrb.8 q3, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s20, s10
+; CHECK-NEXT:    vstrb.8 q5, [r1, #32]
+; CHECK-NEXT:    vpop {d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
@@ -1329,15 +1306,15 @@ define void @vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d8, d4
-; CHECK-NEXT:    vmov.f64 d10, d6
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vmov.f64 d9, d0
+; CHECK-NEXT:    vmov.f64 d8, d4
 ; CHECK-NEXT:    vmov.f64 d11, d2
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
+; CHECK-NEXT:    vmov.f64 d10, d6
 ; CHECK-NEXT:    vmov.f64 d0, d5
 ; CHECK-NEXT:    vstrw.32 q5, [r1]
 ; CHECK-NEXT:    vmov.f64 d2, d7
@@ -1369,32 +1346,32 @@ define void @vst4_v4f64(<4 x double> *%src, <16 x double> *%dst) {
 ; CHECK-NEXT:    .pad #64
 ; CHECK-NEXT:    sub sp, #64
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d14, d12
+; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
+; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d15, d10
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #112]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d15, d10
+; CHECK-NEXT:    vmov.f64 d14, d12
 ; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f64 d14, d4
 ; CHECK-NEXT:    vmov.f64 d15, d2
-; CHECK-NEXT:    vmov.f64 d2, d5
 ; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f64 d4, d0
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f64 d10, d13
-; CHECK-NEXT:    vmov.f64 d12, d0
+; CHECK-NEXT:    vmov.f64 d2, d5
 ; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
 ; CHECK-NEXT:    vmov.f64 d5, d6
+; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
 ; CHECK-NEXT:    vmov.f64 d13, d8
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
+; CHECK-NEXT:    vmov.f64 d12, d0
 ; CHECK-NEXT:    vmov.f64 d8, d1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #80]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
index d055469064e52..5e3546585e94b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -56,14 +56,14 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpt.s32 lt, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q4, [r0]
-; CHECK-NEXT:    vmov.f64 d0, d8
 ; CHECK-NEXT:    vmov.i64 q5, #0xffffffff
+; CHECK-NEXT:    vmov.f32 s0, s16
 ; CHECK-NEXT:    vmov.f32 s2, s17
 ; CHECK-NEXT:    vand q6, q0, q5
 ; CHECK-NEXT:    vmov r0, r1, d13
 ; CHECK-NEXT:    bl __aeabi_ul2d
 ; CHECK-NEXT:    vmov r2, r3, d12
-; CHECK-NEXT:    vmov.f64 d0, d9
+; CHECK-NEXT:    vmov.f32 s0, s18
 ; CHECK-NEXT:    vmov.f32 s2, s19
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    vand q5, q0, q5


        


More information about the llvm-commits mailing list